I want to prevent duplicate entries from being stored when I save crawled values to a JSON file.
The crawler keeps stopping and has to be restarted again and again.
Because content can be missed while the crawler is stopped, when I run it again
I make it start a little further back, but then the message list can contain the same content again, with the same unique number.
When the message list is saved, I want to check whether the unique number ('1_no': num) is already in the JSON file.
If it is, I want to skip that message and move on to the next one, but I don't know how to do that.
# Message list format
# ID / Unique number / Nickname / Time / Title / Video link
['dlehg1346<!>51297273<!>Bdo*<!>2020-01-19 13:02:02<!>Eating show Younger Brother Guest Honey Jam Eating show Busan <!>http://vod-/v1/video123213_1.smil/playlist.m3u8<?>http://vod-/v1/video123213_2.smil/playlist.m3u8'
, , 'thsej9805<!>51297276<!>the905<!>2020-01-19 13:02:16<!>Little 2<!>http://vod-/v1/video12312321_2.smil/playlist.m3u8',
'kej95<!>51297279<!>hsej<!>2020-01-19 13:03:46<!>Broadcast 2<!>http://vod-/v1/video132321_1.smil/playlist.m3u8']
I created a function that saves the values received in the message list above as a json file.
def savetojson(msgList, filename='data.json'):
    """Append crawled messages to a JSON file, skipping ones already saved.

    Each message is a '<!>'-separated string of six fields: BJ ID, unique
    number, nickname, time, title and video links, where the links field
    is itself a '<?>'-separated list of URLs.  Records are grouped in the
    file under their BJ ID.

    A message is skipped when a record with the same unique number
    ('1_no') is already stored under the same BJ ID, so re-running the
    crawler over an overlapping range does not create duplicates.  The
    file is read once and rewritten once, and only when at least one new
    record was added.

    Args:
        msgList: Iterable of raw message strings in the format above.
        filename: Path of the JSON file to update.  Defaults to
            'data.json', the path this function has always used.

    Raises:
        ValueError: If a message does not split into exactly six fields,
            or if the existing file does not contain valid JSON.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
    except FileNotFoundError:
        # First run: nothing has been saved yet, start from an empty store.
        data = {}

    # Unique numbers already stored per BJ ID, so each duplicate check is a
    # set lookup instead of a scan over every saved record.
    seen = {
        saved_id: {entry.get('1_no') for entry in entries}
        for saved_id, entries in data.items()
    }

    added = False
    for tosave in msgList:
        bjid, num, bjNick, date, title, links = tosave.split('<!>')

        known = seen.setdefault(bjid, set())
        if num in known:
            # Already saved (or repeated within this batch): move on.
            continue
        known.add(num)

        # Create the BJ's list on first sight, then append the new record.
        data.setdefault(bjid, []).append({
            '1_no': num,
            '2_bj': bjNick,
            '3_date': date,
            '4_title': title,
            '5_links': links.split('<?>'),
        })
        added = True

    if added:
        # The with-block closes the file; no explicit close() is needed.
        with open(filename, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, sort_keys=True, indent=4,
                      ensure_ascii=False)
Below is the format in which the data.json file is stored.
# Records are grouped by BJ ID; each ID holds a list of entries in the sv form
{
    "00000v": [
        {
            "1_no": "39059686",
            "2_bj": "Fat_Fat",
            "3_date": "2018-11-29 00:37:00",
            "4_title": "[Straw] Lost Ark Sirius Arcana Fostering",
            "5_links": [
                "http://183.111085_1.mp4/playlist.m3u8"
            ]
        }
    ],
    "01072593019": [
        {
            "1_no": "39058930",
            "2_bj": "Black Tiger",
            "3_date": "2018-11-29 00:17:10",
            "4_title": "[Birth][Black Tiger] Star Teample",
            "5_links": [
                "http://125.209.223_209120452_1.mp4/playlist.m3u8",
                "http://101.9120452_2.mp4/playlist.m3u8"
            ]
        }
    ]
}
Using a JSON file as if it were a database is not a good idea.
You can skip the duplicates and save only the new entries for each BJ, but think about it carefully: the title can change, right? 1_no is unique, but the rest of the values can change... What will you do when an entry is updated?
In this case, it is better to use a separate database.
import os
import json
from collections import namedtuple
SAVE_FILENAME = 'data.json'

# Raw crawler output.  Each message is '<!>'-separated:
# BJ ID, unique number, nickname, time, title, video links
# (the links field is a '<?>'-separated list of URLs).
msgList = [
    'dlehg1346<!>51297273<!>Bdo*<!>2020-01-19 13:02:02<!>Eating show Younger Brother Guest Honey Jam Eating show Busan <!>http://vod-/v1/video123213_1.smil/playlist.m3u8<?>http://vod-/v1/video123213_2.smil/playlist.m3u8',
    'thsej9805<!>51297276<!>the905<!>2020-01-19 13:02:16<!>Little 2<!>http://vod-/v1/video12312321_2.smil/playlist.m3u8',
    'kej95<!>51297272<!>hsej<!>2020-01-19 13:03:46<!>Broadcast 2<!>http://vod-/v1/video132321_1.smil/playlist.m3u8',
]

BS = namedtuple('BS', 'id, no, bj, date, title, links')


def merge_records(stored, records):
    """Add parsed records to ``stored``, skipping unique numbers already there.

    Args:
        stored: Dict mapping a BJ ID to its list of saved entries.  It is
            updated in place.
        records: Iterable of ``BS`` tuples to merge in.

    Returns:
        The same ``stored`` dict, for convenience.
    """
    # Per-BJ sets of known unique numbers, built lazily the first time a BJ
    # is seen, so each duplicate check is a set lookup.
    seen = {}
    for bs in records:
        entries = stored.setdefault(bs.id, [])
        known = seen.get(bs.id)
        if known is None:
            known = seen[bs.id] = {entry.get('1_no') for entry in entries}
        if bs.no in known:
            # Same BJ and same unique number: already saved, skip it.
            continue
        known.add(bs.no)
        entries.append({
            '1_no': bs.no,
            '2_bj': bs.bj,
            '3_date': bs.date,
            '4_title': bs.title,
            '5_links': bs.links.split('<?>'),
        })
    return stored


L = [BS(*tosave.split('<!>')) for tosave in msgList]

# Group the batch by BJ ID.  Every message is kept, even when one BJ has
# several messages in the same batch.
D = merge_records({}, L)

if os.path.isfile(SAVE_FILENAME):  # Update if file exists
    with open(SAVE_FILENAME, 'r', encoding='UTF-8') as f:
        loaded_json = json.load(f)
else:  # If the file does not exist, start from an empty store
    loaded_json = {}

merge_records(loaded_json, L)

with open(SAVE_FILENAME, 'w', encoding='UTF-8') as f:
    json.dump(loaded_json, f, ensure_ascii=False)
© 2024 OneMinuteCode. All rights reserved.