Hello, everyone. I performed natural language processing in Python (using khaiii) and obtained the following results.
import numpy as np
import pandas as pd
from khaiii import KhaiiiApi
api = KhaiiiApi()

# Load the speeches; the text to analyse lives in the 'Content' column.
df = pd.read_csv('president.csv', encoding='utf-8')

# Remove commas (literal match, not a regex) and split each document on '.',
# giving one row per sentence that still carries the index of its source row.
sentences = (
    df['Content']
    .str.replace(',', '', regex=False)
    .str.split('.')
    .explode()
    .to_frame('sentences')
)

# Attach the sentences to the other columns and drop the raw text.
df = df.merge(sentences, left_index=True, right_index=True, how='left')
df = df.drop(columns=['Content'])  # the original `Axis = 1` keyword raises TypeError

# Discard the empty / single-space fragments left behind by the split.
# Assigning the result back avoids a chained in-place replace on a column.
df['sentences'] = df['sentences'].replace(['', ' '], np.nan)
df = df.dropna(subset=['sentences']).reset_index(drop=True)

# For every sentence, keep the second tab-separated field of each analysed
# word's string form.
nlp = df['sentences'].apply(
    lambda sentence: [str(word).split('\t')[1] for word in api.analyze(sentence)]
)
What I want to do is extract only the words tagged NNG, VV, and VA from the resulting nlp column. I am asking because I still find Python difficult to use.
For example, I'd like to refine the first entry of nlp into the following form:
[Young (C), dirty, small and medium-sized enterprises, president, Ha (C)]
Additionally, I would like to append 'da' to the end of the VV and VA words to turn them into their complete form, so I would appreciate it if you could tell me how.
pandas filtering nlp
I'm answering partly as review and study for myself, so I'm not sure how much this will help the questioner.
This is a sample using Word2vec, which is used a lot for embedding.
The model used is skip-gram.
import itertools as it
import pandas as pd
from khaiii import KhaiiiApi
df = pd.read_csv('https://drive.google.com/u/0/uc?id=1IZ1NYJmbabv6Xo7WJeqRcDFl1Z5pumni&export=download')
df.drop(columns=['NO', 'President', 'Date'], inplace=True)
api = KhaiiiApi()


def parse(sentence):
    """Return the NNG, VV and VA morphemes of *sentence* in order of appearance.

    NNG morphemes are returned unchanged; VV and VA morphemes get the ending
    'da' appended so that they read as a complete form.
    """
    wanted_tags = ('NNG', 'VV', 'VA')  # only nng, vv, va
    pos = (
        (morph.lex, morph.tag)
        for word in api.analyze(sentence)
        for morph in word.morphs
        if morph.tag in wanted_tags
    )
    # NOTE(review): the suffix literal was lost in the original line; 'da' is
    # the ending requested in the question (presumably the Korean ending for
    # real Korean text) - confirm.
    return [lex if tag == 'NNG' else f'{lex}da' for lex, tag in pos]


df['nlp'] = df['content'].apply(parse)

from gensim.models.word2vec import Word2Vec

# sg=1 selects the skip-gram training algorithm.
# NOTE(review): `iter=` and `init_sims` are gensim 3.x spellings; gensim 4
# renamed `iter` to `epochs` and deprecated `init_sims` - confirm the
# installed version before running.
model = Word2Vec(df['nlp'].values, sg=1, window=5, min_count=1, workers=4, iter=100)
model.init_sims(replace=True)
print(model.wv.similarity('young', 'president'))
print(model.wv.similarity('young', 'small business'))

# Frequency number
from collections import Counter
df['cnt'] = df['nlp'].apply(lambda words: dict(Counter(words)))
The president.csv file was not uploaded, and khaiii is not currently installed on this machine, so I'll try konlpy instead.
Also, if it's a csv file like above, it's better to extract only 'content' and handle one text.
https://drive.google.com/open?id=1_wuhj7bzI10iekt_uW1_tEOGXEJY2fVR
import itertools as it
from konlpy.tag import Kkma
# The filename contains an apostrophe, so it must be quoted with double quotes.
with open("Moon Jae-in's inauguration speech.txt", encoding='euc-kr') as f:
    content = f.read().splitlines()[0]  # only the first line of the file is analysed

kkma = Kkma()
morph = kkma.pos(content)  # Morphological analysis

# Grouped by part of speech (deduplicated). groupby only merges adjacent
# items, so the pairs are sorted by tag (item[1]) first; item[0] is the word.
groups = {
    tag: {item[0] for item in items}
    for tag, items in it.groupby(
        sorted(morph, key=lambda item: item[1]),
        key=lambda item: item[1],
    )
}
print(groups['NNG'])
# {'First Step', 'Je', 'People', 'Head', 'Experience', 'Passion', 'Thank you', 'Today', 'Respect', 'Shoulder', 'World', 'Coexistence', 'Choice', 'Korea', 'Once', 'Love', 'Heart', 'Unification', 'Blueprint', 'Country', 'Calling'}
print(groups['VV'])
# {'Cha', 'Heading', 'Sook', 'Nedid', 'Give', 'Dree', 'Made', 'Yeol', 'Gal'}
print([f'{word}da' for word in groups['VV']])
# ["Cold", "Heading", "Bow", "Take", "Give", "Make", "Open", "Grinding"]
No index removal or column name changes were made.
import itertools as it
import pandas as pd
from konlpy.tag import Kkma
df = pd.read_csv('https://drive.google.com/u/0/uc?id=1IZ1NYJmbabv6Xo7WJeqRcDFl1Z5pumni&export=download')
df.drop(columns=['NO', 'President', 'Date'], inplace=True)
kkma = Kkma()


def parse(sentence):
    """Return the distinct NNG, VV and VA words found in *sentence*.

    The result lists the NNG words first, then the VV words and the VA words,
    the latter two with 'da' appended. Words are deduplicated per tag through
    a set, so the order inside each group is not defined.
    """
    # Grouped by part of speech (deduplicated). groupby only merges adjacent
    # items, so the (word, tag) pairs are sorted by tag first.
    groups = {
        tag: {item[0] for item in items}
        for tag, items in it.groupby(
            sorted(kkma.pos(sentence), key=lambda item: item[1]),
            key=lambda item: item[1],
        )
    }
    # 'VV' (not 'V') is the tag key used for verbs elsewhere on this page;
    # looking up 'V' would silently drop every verb.
    return [
        *groups.get('NNG', []),
        *[f'{word}da' for word in groups.get('VV', [])],
        *[f'{word}da' for word in groups.get('VA', [])],
    ]


df['nlp'] = df['content'].apply(parse)
print(df['nlp'][0])
["Newspaper", "Early", "Additional", "Previous Day", "Competitive", "Production", "Question", "Dollar", "Government", "Financial", "Export", "Small", "Small", "Small", "Small", "Small", "Loan", "Work", "Early", "Value", "Now", "Pee", "Corporate", "Operation", "Production", "Production", "Production", "Production", "Problem", "Production", "Site", "Promotion", "Production", "Production", "Small"', "Unemployed", "Century", "Fat", "Fund", "Increase", "Support", "Nam", "Credit", "Technology", "Efforts", Interest Rate", "Budget", "Unemployment", "Proper", "Time", "Proper", "Executive", "Executive", "Assurance", "Company Bond", "Depreciation", "Large", "Work", "Hal", "Hal", "Header", "Ready", "Head", "Ready", "Header", "Header", "Header", "Header", "Ready", " "Expensive", "Look", "Good", "Sad", "How is it", "Young", "Same", "Like"]
I'm posting this after checking it on a machine where khaiii is installed. (Two lines have been changed.)
import itertools as it
import pandas as pd
from khaiii import KhaiiiApi
df = pd.read_csv('https://drive.google.com/u/0/uc?id=1IZ1NYJmbabv6Xo7WJeqRcDFl1Z5pumni&export=download')
df.drop(columns=['NO', 'President', 'Date'], inplace=True)
api = KhaiiiApi()


def parse(sentence):
    """Return the distinct NNG, VV and VA morphemes found in *sentence*.

    The result lists the NNG morphemes first, then the VV and the VA
    morphemes, the latter two with 'da' appended. Morphemes are deduplicated
    per tag through a set, so the order inside each group is not defined.
    """
    pos = (
        (morph.lex, morph.tag)
        for word in api.analyze(sentence)
        for morph in word.morphs
    )
    # Grouped by part of speech (deduplicated). groupby only merges adjacent
    # items, so the (lex, tag) pairs are sorted by tag first.
    groups = {
        tag: {item[0] for item in items}
        for tag, items in it.groupby(
            sorted(pos, key=lambda item: item[1]),
            key=lambda item: item[1],
        )
    }
    # 'VV' (not 'V') is the tag the question asks for and the one filtered on
    # in the earlier khaiii snippet; looking up 'V' would drop every verb.
    return [
        *groups.get('NNG', []),
        *[f'{word}da' for word in groups.get('VV', [])],
        *[f'{word}da' for word in groups.get('VA', [])],
    ]


df['nlp'] = df['content'].apply(parse)
print(df['nlp'][0])
["Job", "Small Amount", "Future", "Fund", "Production", "Use", "Shipment", "Our Country", "Massage", "Recent", "Execution", "Important", "Corporate", "Loan", "Exportation", "Financial", "Plan", "Additional", "Country", "Site", "Creduction", "Technical", "Invent", "Earge", "Earge", "Code", "Propathy", "Propathy", "Code", "Code", "Earge", "Earge", "Code", "U "Efforts", "Work", "Problem", "Problem", "Budget", "Past", "Work", "Mal", "Provision", "Small and Medium Business", "Bankruptcy", "First Line", "Money", "Talk", "Time", "Extension", "Calculation", "Export", "Time", "Evening", "Present", "Guarantee", "increase", "Go", "Go", "Go", "Go", "Go", "Go", "Go", "Go", "Go", "Go", "Go", "Go", "Go", "Go", "Go", "Go", "Go", "Go",
© 2024 OneMinuteCode. All rights reserved.