I want to split the sentences in a TSV file into words and store the words of each sentence as nested lists.

Asked 2 years ago, Updated 2 years ago, 143 views

I read the TSV file, split each line into an ID and a sentence, and break the sentence into words.
What I don't know is the code to collect only the words of each sentence into a nested list of the form
[['word', 'word', 'word'], ['word', ...], ...]
Can someone tell me the code?

from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

import MeCab
import csv

mt = MeCab.Tagger()

reports = []

with open("tfidf.tsv", mode='r', encoding='utf-8') as f:
    # the TSV file contains review IDs and review texts separated by a tab
    reader = csv.reader(f, delimiter="\t")
    for report_id, report in reader:
        words = []
        node = mt.parseToNode(report)
        while node:
            if node.feature.split(",")[0] == "名詞":  # noun
                words.append(node.surface)
            elif node.feature.split(",")[0] == "形容詞":  # adjective
                words.append(node.feature.split(",")[6])
            elif node.feature.split(",")[0] == "動詞":  # verb
                words.append(node.feature.split(",")[6])
            node = node.next

        stopword = ['れる','の','られる','せる','させる','LED','pantone','20','30','β','3','?','9','(','\u3000','&','A','W','%','×','"','6','15','1994','3',"'",'GW','25','OO','s','1','/','500','%','.','0','2020','V','13','5','180','-','%)','(','+','4','8','10','100','14','ABS','4','K','"','7','50','.','33','5','上','器','丿','・','0','40','7','?',')','60','+','2','/','、','。','「','」','2','1000','1010123','1093','12','120','16','17','1930','1972','200','2002','21','24','260','27','34','35','36','360','37','40000','4136','43','480','70','75',
                    'agata','back','ball','basic','cd','chadwick','chair','co','crt','daa','degrees','dm','dvd','eco','epd','eva','fsc','gulfman','homearchi','hybrid','ic','ipad','ipod','lan','leavbes','led','may','md','mdf','mf','mm','mmx','ms','nanoe','nasa','nc','no','oa','off','ohashi','ojigi','on','pantone','pc','pet','pp','prism','pro','rom','sd','shock','soho','td','the','to','tpo','trek','usb','ventura','way','wear','av','diy','led','led','pc',
                    '1つ','2つ','そう','どこ','ここ','これ','この','いつ','こんな','いくつ','これら','どの','その','それぞれ','そのもの','こうした','それ','それなり','こちら','こういう','そういう','どう','どちら','どれ','もの','こと','ところ','よう','ため','ぶり','とう','とき',
                    '考える','かんがえる','わかる','分かる','分る','見える','みえる','知る','しる','知れる','しれる','言える','いえる','示す','しめす','述べる','のべる','書く','かく','よる','だす','出す','入る','はいる','いれる','入れる','使う','つかう','用いる','もちいる','持つ','もつ','もてる','持てる','作る','つくる','なす','起こる','おこる','つく','つける','付く','付ける','聞く','よぶ','きく','呼ぶ',
                    'ない','高い','低い','多い','少ない','強い','大きい','小さい','長い','ながい','よい','良い','悪い','いい','やすい','にくい','うまい','デザイン','機能','商品','製品']
        # keep only the tokens that are not stopwords
        words2 = [token for token in words if token not in stopword]
        # report_id becomes the tag, the list of words becomes the text
        reports.append(TaggedDocument(words=words2, tags=[report_id]))

python mecab natural-language-processing

2022-09-30 19:39

1 Answer

I'm not sure exactly what you're looking for, so this answer is my best guess.
If you want a nested list of words per sentence, of the form [['word', 'word', 'word'], ['word', ...], ...], you can build it by appending the word list created in the for statement to another (outer) list, as shown in the sketch below.
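For illustration, here is a minimal standalone sketch of that pattern; the sentence strings and the whitespace split are placeholders, not the MeCab pipeline from the question:

 sentences = ["a b c", "d e"]  # placeholder input, one string per sentence

 allWords = []  # outer list: one inner list per sentence
 for sentence in sentences:
     words = sentence.split()  # inner word list created in the for statement
     allWords.append(words)    # append the whole list, do not extend

 print(allWords)  # [['a', 'b', 'c'], ['d', 'e']]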

The sample code below keeps report_id as the key of a dictionary (wordDic), so it can print both the TaggedDocument list from your code and the list of only the words of each sentence.

from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

import MeCab
import csv

mt = MeCab.Tagger()

reports = []  # list of TaggedDocuments (kept from the original question)
wordDic = {}  # report_id of each line -> list of only the words of that sentence

# review ID and review text separated by a tab on each line
tsv = ["1\tStart this LED with usb", "2\tWhen did nasa and jaxa make ohashi", "3\tThat ojigi's degrees are too big"]
reader = csv.reader(tsv, delimiter="\t")
for report_id, report in reader:
    words = []
    node = mt.parseToNode(report)
    while node:
        if node.feature.split(",")[0] == "名詞":  # noun
            words.append(node.surface)
        elif node.feature.split(",")[0] == "形容詞":  # adjective
            words.append(node.feature.split(",")[6])
        elif node.feature.split(",")[0] == "動詞":  # verb
            words.append(node.feature.split(",")[6])
        node = node.next

    wordDic[report_id] = words

stopword = ['れる','の','られる','せる','させる','LED','pantone','20','30','β','3','?','9','(','\u3000','&','A','W','%','×','"','6','15','1994','3',"'",'GW','25','OO','s','1','/','500','%','.','0','2020','V','13','5','180','-','%)','(','+','4','8','10','100','14','ABS','4','K','"','7','50','.','33','5','上','器','丿','・','0','40','7','?',')','60','+','2','/','、','。','「','」','2','1000','1010123','1093','12','120','16','17','1930','1972','200','2002','21','24','260','27','34','35','36','360','37','40000','4136','43','480','70','75',
            'agata','back','ball','basic','cd','chadwick','chair','co','crt','daa','degrees','dm','dvd','eco','epd','eva','fsc','gulfman','homearchi','hybrid','ic','ipad','ipod','lan','leavbes','led','may','md','mdf','mf','mm','mmx','ms','nanoe','nasa','nc','no','oa','off','ohashi','ojigi','on','pantone','pc','pet','pp','prism','pro','rom','sd','shock','soho','td','the','to','tpo','trek','usb','ventura','way','wear','av','diy','led','led','pc',
            '1つ','2つ','そう','どこ','ここ','これ','この','いつ','こんな','いくつ','これら','どの','その','それぞれ','そのもの','こうした','それ','それなり','こちら','こういう','そういう','どう','どちら','どれ','もの','こと','ところ','よう','ため','ぶり','とう','とき',
            '考える','かんがえる','わかる','分かる','分る','見える','みえる','知る','しる','知れる','しれる','言える','いえる','示す','しめす','述べる','のべる','書く','かく','よる','だす','出す','入る','はいる','いれる','入れる','使う','つかう','用いる','もちいる','持つ','もつ','もてる','持てる','作る','つくる','なす','起こる','おこる','つく','つける','付く','付ける','聞く','よぶ','きく','呼ぶ',
            'ない','高い','低い','多い','少ない','強い','大きい','小さい','長い','ながい','よい','良い','悪い','いい','やすい','にくい','うまい','デザイン','機能','商品','製品']
for report_id in wordDic:
    print(wordDic[report_id])
    words2 = [token for token in wordDic[report_id] if token not in stopword]
    reports.append(TaggedDocument(words=words2, tags=[report_id]))

print(reports)
# a list of only the words of each sentence
print(list(wordDic.values()))
# a list of only words2 (after stopword removal) for each sentence
print([report.words for report in reports])

The output (the first three lines come from print(wordDic[report_id]) inside the loop) is:

['LED', 'usb', 'Start', 'Do', 'Let']
['nasa', 'jaxa', 'is', 'ohashi', 'make', 'of']
['ojigi', 'degrees', 'big', 'too much']
[TaggedDocument(words=['Start'], tags=['1']), TaggedDocument(words=['jaxa'], tags=['2']), TaggedDocument(words=[], tags=['3'])]
[['LED', 'usb', 'Start', 'Do', 'Let'], ['nasa', 'jaxa', 'is', 'ohashi', 'make', 'of'], ['ojigi', 'degrees', 'big', 'too much']]
[['Start'], ['jaxa'], []]
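If the eventual goal is Doc2Vec training, the reports list can then be passed straight to the model. A minimal sketch; the parameter values here are arbitrary examples, and model.dv assumes gensim 4.x (it was model.docvecs in older versions):

 # train on the TaggedDocument list built above (example parameters, not tuned)
 model = Doc2Vec(documents=reports, vector_size=50, min_count=1, epochs=20)

 # look up the learned vector for the sentence tagged '1'
 vector = model.dv['1']
 print(vector[:5])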


2022-09-30 19:39


