I read a TSV file in which each line contains an ID and a review text separated by a tab.
I don't know how to write the code that builds a list containing only the words of each sentence,
in the form
[['word', 'word', 'word'], ['word', ...], ...]
Can someone show me the code?
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import MeCab
import csv
mt = MeCab.Tagger()

# Stopwords: function words, numbers, symbols, and domain-specific noise tokens.
# Kept as a set so the per-token membership test below is O(1), and defined once
# outside the row loop instead of being rebuilt for every line of the file.
# NOTE(review): several entries of the original list were garbled in the source;
# the unreadable ones were reconstructed best-effort — verify against the original data.
stopword = {
    'れる', 'の', 'られる', 'せる', 'させる', 'LED', 'pantone', '20', '30', 'β', '3', '?', '9',
    '(', '\u3000', '&', 'A', 'W', '%', '×', '"', '6', '15', '1994', "'", 'GW', '25', 'OO', 's',
    '1', '/', '500', '.', '0', '2020', 'V', '13', '5', '180', '-', '%)', '+', '4', '8', '10',
    '100', '14', 'ABS', 'K', '7', '50', '33', '上', '器', '丿', '・', '40', ')', '60', '2',
    '、', '。', '「', '」', '1000', '1010123', '1093', '12', '120', '16', '17', '1930', '1972',
    '200', '2002', '21', '24', '260', '27', '34', '35', '36', '360', '37', '40000', '4136',
    '43', '480', '70', '75', 'agata', 'back', 'ball', 'basic', 'cd', 'chadwick', 'chair', 'co',
    'crt', 'daa', 'degrees', 'dm', 'dvd', 'eco', 'epd', 'eva', 'fsc', 'gulfman', 'homearchi',
    'hybrid', 'ic', 'ipad', 'ipod', 'lan', 'leavbes', 'led', 'may', 'md', 'mdf', 'mf', 'mm',
    'mmx', 'ms', 'nanoe', 'nasa', 'nc', 'no', 'oa', 'off', 'ohashi', 'ojigi', 'on', 'pc',
    'pet', 'pp', 'prism', 'pro', 'rom', 'sd', 'shock', 'soho', 'td', 'the', 'to', 'tpo',
    'trek', 'usb', 'ventura', 'way', 'wear', 'av', 'diy', '1つ', '2つ', 'そう', 'どこ', 'ここ',
    'これ', 'この', 'いつ', 'こんな', 'いくつ', 'これら', 'どの', 'その', 'それぞれ', 'そのもの',
    'こうした', 'それ', 'それなり', 'こちら', 'ひとつ', 'こういう', 'そういう', 'どう', 'どちら',
    'どれ', 'もの', 'こと', 'ところ', 'よう', 'ため', 'とき', 'とう',
    'える', 'かんがえる', 'わかる', '分かる', '分る', '見える', 'みえる', '知る', 'しる',
    '知れる', 'しれる', '言える', 'いえる', '示す', 'しめす', '述べる', 'のべる', '書く', 'かく',
    'よる', 'だす', '出す', '入る', 'はいる', 'いれる', '入れる', '使う', 'つかう', '用いる',
    'もちいる', '持つ', 'もつ', 'もてる', '持てる', '作る', 'つくる', 'なす', '起こる', 'おこる',
    'つく', 'つける', '付く', '付ける', '聞く', 'よぶ', 'きく', '呼ぶ', 'ない', '高い', '低い',
    '多い', '少ない', '強い', '大きい', '小さい', '長い', 'ながい', 'よい', '良い', '悪い',
    'いい', 'やすい', 'にくい',
}

reports = []
with open("tfidf.tsv", mode='r', encoding='utf-8') as f:
    # tfidf.tsv: each line is "<review id>\t<review text>", tab-separated.
    reader = csv.reader(f, delimiter="\t")
    for report_id, report in reader:
        words = []
        node = mt.parseToNode(report)
        while node:
            # MeCab's feature string is comma-separated; field 0 is the
            # part of speech, field 6 the base (dictionary) form.
            pos = node.feature.split(",")[0]
            if pos == "名詞":        # noun: keep the surface form
                words.append(node.surface)
            elif pos == "形容詞":    # adjective: keep the base form
                words.append(node.feature.split(",")[6])
            elif pos == "動詞":      # verb: keep the base form
                words.append(node.feature.split(",")[6])
            node = node.next
        # Drop stopwords, then tag the word list with its review id.
        words2 = [token for token in words if token not in stopword]
        reports.append(TaggedDocument(words=words2, tags=[report_id]))
I'm not sure exactly what you're looking for, so here is my best guess.
If you want a separate list of the words of each sentence, in the form
[['word', 'word', 'word'], ['word', ...], ...],
you can build it by appending each per-sentence word list
(the one created inside the for statement) to another list.
The sample code below keeps report_id as the key of a dictionary (wordDic),
so it shows both the original question's code and the per-sentence word lists.
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import MeCab
import csv
mt = MeCab.Tagger()

reports = []   # list of TaggedDocuments (same as the original question)
wordDic = {}   # report_id -> list of words for that sentence

# Stopwords: function words, numbers, symbols, and domain-specific noise tokens.
# Kept as a set so the per-token membership test below is O(1), and defined once
# up front instead of inside the processing loop.
# NOTE(review): several entries of the original list were garbled in the source;
# the unreadable ones were reconstructed best-effort — verify against the original data.
stopword = {
    'れる', 'の', 'られる', 'せる', 'させる', 'LED', 'pantone', '20', '30', 'β', '3', '?', '9',
    '(', '\u3000', '&', 'A', 'W', '%', '×', '"', '6', '15', '1994', "'", 'GW', '25', 'OO', 's',
    '1', '/', '500', '.', '0', '2020', 'V', '13', '5', '180', '-', '%)', '+', '4', '8', '10',
    '100', '14', 'ABS', 'K', '7', '50', '33', '上', '器', '丿', '・', '40', ')', '60', '2',
    '、', '。', '「', '」', '1000', '1010123', '1093', '12', '120', '16', '17', '1930', '1972',
    '200', '2002', '21', '24', '260', '27', '34', '35', '36', '360', '37', '40000', '4136',
    '43', '480', '70', '75', 'agata', 'back', 'ball', 'basic', 'cd', 'chadwick', 'chair', 'co',
    'crt', 'daa', 'degrees', 'dm', 'dvd', 'eco', 'epd', 'eva', 'fsc', 'gulfman', 'homearchi',
    'hybrid', 'ic', 'ipad', 'ipod', 'lan', 'leavbes', 'led', 'may', 'md', 'mdf', 'mf', 'mm',
    'mmx', 'ms', 'nanoe', 'nasa', 'nc', 'no', 'oa', 'off', 'ohashi', 'ojigi', 'on', 'pc',
    'pet', 'pp', 'prism', 'pro', 'rom', 'sd', 'shock', 'soho', 'td', 'the', 'to', 'tpo',
    'trek', 'usb', 'ventura', 'way', 'wear', 'av', 'diy', '1つ', '2つ', 'そう', 'どこ', 'ここ',
    'これ', 'この', 'いつ', 'こんな', 'いくつ', 'これら', 'どの', 'その', 'それぞれ', 'そのもの',
    'こうした', 'それ', 'それなり', 'こちら', 'ひとつ', 'こういう', 'そういう', 'どう', 'どちら',
    'どれ', 'もの', 'こと', 'ところ', 'よう', 'ため', 'とき', 'とう',
    'える', 'かんがえる', 'わかる', '分かる', '分る', '見える', 'みえる', '知る', 'しる',
    '知れる', 'しれる', '言える', 'いえる', '示す', 'しめす', '述べる', 'のべる', '書く', 'かく',
    'よる', 'だす', '出す', '入る', 'はいる', 'いれる', '入れる', '使う', 'つかう', '用いる',
    'もちいる', '持つ', 'もつ', 'もてる', '持てる', '作る', 'つくる', 'なす', '起こる', 'おこる',
    'つく', 'つける', '付く', '付ける', '聞く', 'よぶ', 'きく', '呼ぶ', 'ない', '高い', '低い',
    '多い', '少ない', '強い', '大きい', '小さい', '長い', 'ながい', 'よい', '良い', '悪い',
    'いい', 'やすい', 'にくい',
}

# Sample data: each element is "<review id>\t<review text>", tab-separated.
# (The id must come before the tab, otherwise csv.reader yields the wrong id.)
tsv = ["1\tStart this LED with usb",
       "2\tWhen did nasa and jaxa make ohashi",
       "3\tThat ojigi's degrees are too big"]
reader = csv.reader(tsv, delimiter="\t")
for report_id, report in reader:
    words = []
    node = mt.parseToNode(report)
    while node:
        # MeCab's feature string is comma-separated; field 0 is the
        # part of speech, field 6 the base (dictionary) form.
        pos = node.feature.split(",")[0]
        if pos == "名詞":        # noun: keep the surface form
            words.append(node.surface)
        elif pos == "形容詞":    # adjective: keep the base form
            words.append(node.feature.split(",")[6])
        elif pos == "動詞":      # verb: keep the base form
            words.append(node.feature.split(",")[6])
        node = node.next
    wordDic[report_id] = words

for report_id in wordDic:
    print(wordDic[report_id])
    # Drop stopwords, then tag the remaining words with their review id.
    words2 = [token for token in wordDic[report_id] if token not in stopword]
    reports.append(TaggedDocument(words=words2, tags=[report_id]))
print(reports)
# List only the words of each sentence.
print(list(wordDic.values()))
# List only the filtered words (words2) of each sentence.
print([report.words for report in reports])
['LED', 'usb', 'Start', 'Do', 'Let']
['nasa', 'jaxa', 'is', 'ohashi', 'make', 'of']
['ojigi', 'degrees', 'big', 'too much']
[TaggedDocument(words=['Start'], tags=['1']), TaggedDocument(words=['jaxa'], tags=['2']), TaggedDocument(words=[], tags=['3'])]
[['LED', 'usb', 'Start', 'Do', 'Let'], ['nasa', 'jaxa', 'is', 'ohashi', 'make', 'of'], ['ojigi', 'degrees', 'big', 'too much']]
[['Start'], ['jaxa'], []]
© 2024 OneMinuteCode. All rights reserved.