import os
import pandas as pd
def searchFiles(path):
    """Return the full path of every entry directly inside *path*.

    Note: os.listdir does not recurse, so only the top level of the
    directory is scanned.
    """
    # Idiom fix: build the list with a comprehension instead of a
    # manual loop-and-append.
    return [os.path.join(path, filename) for filename in os.listdir(path)]
def main():
    """Load every review CSV under ./Reviews/ into one DataFrame and print it."""
    frames = [pd.read_csv(path, encoding='utf-8') for path in searchFiles('./Reviews/')]
    # ignore_index=True renumbers rows 0..N-1 across all files.
    combined = pd.concat(frames, ignore_index=True)
    print(combined)


if __name__ == '__main__':
    main()
I crawled Google Play Store reviews into CSV files, and the code above successfully loads them into a single DataFrame. I then ran the following code to build a document-term matrix of the words that appear more than once across the whole document set.
import MeCab
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
def searchFiles(path):
    """Return the full path of every entry directly inside *path*.

    Note: os.listdir does not recurse, so only the top level of the
    directory is scanned.
    """
    # Idiom fix: build the list with a comprehension instead of a
    # manual loop-and-append.
    return [os.path.join(path, filename) for filename in os.listdir(path)]
def getNVM_lemma(text):
    """Tokenize *text* with MeCab and return the content words.

    Keeps tokens whose first tag is one of NNG/NNP/VV/VA/VX/VCP/VCN/MAG,
    and recovers lemmas from MeCab's compound analyses for verb-like tags
    (VV/VA/VX).  Always returns a list (possibly empty), so it is safe to
    pass as the ``tokenizer=`` of a CountVectorizer/TfidfVectorizer.
    """
    tokenizer = MeCab.Tagger()
    parsed = tokenizer.parse(text)
    word_tag = parsed.split("\n")
    pos = []
    tags = ['NNG', 'NNP', 'VV', 'VA', 'VX', 'VCP', 'VCN', 'MAG']
    # The last two lines of MeCab output are 'EOS' and '', so skip them.
    for word_ in word_tag[:-2]:
        word = word_.split("\t")
        tag = word[1].split(",")
        if len(word[0]) < 2:
            # Skip single-character surface forms.
            continue
        if tag[-1] != '*':
            # A compound analysis is present: pull out the lemma and keep
            # it only for verb-like parts of speech.
            t = tag[-1].split('/')
            if len(t[0]) > 1 and ('VV' in t[1] or 'VA' in t[1] or 'VX' in t[1]):
                pos.append(t[0])
        else:
            if tag[0] in tags:
                pos.append(word[0])
    # BUG FIX: this return must be at function-body level.  In the broken
    # version it was indented inside the loop/conditional, so some inputs
    # fell through the end of the function and returned None, which made
    # CountVectorizer raise "TypeError: 'NoneType' object is not iterable".
    return pos
def main():
    """Build a document-term matrix from every review CSV under ./Reviews/."""
    frames = [pd.read_csv(path, encoding='utf-8') for path in searchFiles('./Reviews/')]
    docs = pd.concat(frames, ignore_index=True)
    # min_df=2 keeps only words that appear in at least two documents.
    tf_vect = CountVectorizer(tokenizer=getNVM_lemma, min_df=2)
    dtm = tf_vect.fit_transform(docs['content'])


if __name__ == '__main__':
    main()
Running this code produced the following error:
Traceback (most recent call last):
File "C:/Users/chlwn/PycharmProjects/untitled1/cos pro 2/2.py", line 71, in <module>
main()
File "C:/Users/chlwn/PycharmProjects/untitled1/cos pro 2/2.py", line 68, in main
dtm = tf_vect.fit_transform (docs['content'])
File "C:\Users\chlwn\PycharmProjects\untitled1\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 1199, in fit_transform
self.fixed_vocabulary_)
File "C:\Users\chlwn\PycharmProjects\untitled1\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 1110, in _count_vocab
for feature in analyze(doc):
TypeError: 'NoneType' object is not iterable
Process finished with exit code 1
What should I do if an error like this occurs?
mecab nlp sklearn
Below is my answer describing what I believe is the most likely cause.
Looking back, my first answer did not actually match the error message: the traceback complains that the result of analyze is None, not that doc is None. So I went through the code again.
The indentation of the final `return pos` in the `getNVM_lemma` function is wrong. The function is supposed to take a chunk of text and split it into words, but because the `return` is mis-indented, some paths through the for loop — for example the `continue` branch — fall straight through to the end of the function without ever reaching a `return` statement, so the function returns None by default. A linter such as pylint would have flagged this. `getNVM_lemma` is used as one processing step inside CountVectorizer's `analyze` function, and when it returns None, `analyze` yields None as well — which matches the error message `TypeError: 'NoneType' object is not iterable`. Move `return pos` so it is indented at the function-body level, outside the for loop, and the error goes away.
© 2024 OneMinuteCode. All rights reserved.