Thank you for your help — I'm a Python beginner. I'm scraping horse racing data using Python 2.7 and BeautifulSoup, but I'm having a hard time.
I can print only one row, and I don't know how to scrape all 20 rows or how to write the output to a CSV file.
Problem 1: unable to save to CSV.
Problem 2: the 20 rows cannot be scraped together.
# -*- coding:utf-8 -*-
import urllib2
import codecs
from bs4 import BeautifulSoup
f = codecs.open('horse.csv', 'w', 'utf-8')
f.write('race_date,kaisai,weather,race_number,race_name,tousuu,frame_number,horse_number,single_win_ratio,popularity,horse_arrival,horse_name,weight,distance,baba, race_time,difference,horse_pass,pace,nobori,horse_weight,win_horse,prize_money'+ u"\n")
tpl_url='http://db.netkeiba.com/?pid=jockey_detail&id=00663&page={0}'
for i in xrange(1, 5):
url=tpl_url.format( i )
soup = BeautifulSoup(urllib2.urlopen(url).read(),"lxml")
tr_arr = soup.find('div', {'id':'contents_liquid'}).findAll('tbody')
for tr in tr_arr:
# B_data Eich date #
lrg1= tr.findAll('td')[0].string
# Facebook, Nokia, holding #
lrg2= tr.findAll('td')[1].string
# Weather #weather
lrg3 = tr.findAll('td')[2].string
#th ○race #race_number
lrg4 = tr.findAll('td')[3].string
# Race name #race_name
lrg5 = tr.findAll('td')[4].string
# Video (I want to delete this part)
lrg6 = tr.findAll('td')[5].string
# Number of heads #tousuuu
lrg7 = tr.findAll('td')[6].string
# frame number # frame_number
lrg8 = tr.findAll('td')[7].string
# Horseman #horse_number
lrg9 = tr.findAll('td')[8].string
#Single_win_racio
lrg10 = tr.findAll('td')[9].string
#Popularity
lrg11 = tr.findAll('td')[10].string
#Order of arrival #horse_arrival
lrg12 = tr.findAll('td')[11].string
# horse name #horse_name
lrg13 = tr.findAll('td', {'class':'txt_l'})[1]
# catty #weight
lrg14 = tr.findAll('td')[13].string
# Distance # distance
lrg15 = tr.findAll('td')[14].string
# Baba #baba
lrg16 = tr.findAll('td')[15].string
# time #race_time
lrg17 = tr.findAll('td')[16].string
# difference #difference
lrg18 = tr.findAll('td')[17].string
# Pass #horse_pass
lrg19 = tr.findAll('td')[18].string
#pace #pace
lrg20 = tr.findAll('td')[19].string
# Up #nobori
lrg21 = tr.findAll('td')[20].string
#Horse weight #horse_weight
lrg22 = tr.findAll('td')[21].string
#Winning horse #win_horse
lrg23 = tr.findAll('td', {'class':'txt_l'})[2]
#Prize money #prize_money
lrg24 = tr.findAll('td')[23].string
print lrg1, lrg2, lrg3, lrg4, lrg5, lrg6, lrg7, lrg8, lrg9, lrg10, \
lrg11, lrg12, lrg13.a.string, lrg14, lrg15, lrg16, lrg17, \
lrg18, lrg19, lrg20, lrg21, lrg22, lrg23.a.string, lrg24
f.close()
I tried rewriting it with a for loop, but I couldn't get it to work — I lack the skills.
Environment: Python 2.7.
The program calls findAll over and over and was written by brute force.
Could someone more experienced please point me in the right direction?
There are 20 rows in the table, but only one row is retrieved. The reason is that
soup.find('div', {'id':'contents_liquid'}).findAll('tbody')
matches only one tbody, so the for loop runs just once. You need to extract each tr from inside the tbody instead:
soup.find('div', {'id':'contents_liquid'}).find('tbody').findAll('tr')
Also note that tr.findAll('td')[0].string
returns a NavigableString object. To write it to CSV, retrieve a plain string with tr.findAll('td')[0].text
instead.
See the csv module documentation for more information on using CSV with Python. To write Unicode to the CSV under Python 2, .encode('utf-8')
each unicode string. Also, it's better to use the with
statement than to call .close()
manually.
Here is the modified code:
# Scrape a jockey's race results from db.netkeiba.com into horse.csv
# using the csv module and a with statement.
import urllib2  # typo fix: was 'import urlib2', which raises ImportError
import codecs
from bs4 import BeautifulSoup
import csv

# The with statement closes the file automatically — no f.close() needed.
with open('horse.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['race_date', 'kaisai', 'weather',  # ...
                     ])
    tpl_url = 'http://db.netkeiba.com/?pid=jockey_detail&id=00663&page={0}'
    for i in xrange(1, 5):
        url = tpl_url.format(i)
        soup = BeautifulSoup(urllib2.urlopen(url).read(), "lxml")
        # Single <tbody>: iterate over its <tr> rows, not over findAll('tbody').
        tr_arr = soup.find('div', {'id': 'contents_liquid'}).find('tbody').findAll('tr')
        for tr in tr_arr:
            tds = tr.findAll('td')
            txt_l = tr.findAll('td', {'class': 'txt_l'})  # hoisted: queried once per row
            lrg = []
            for index, td in enumerate(tds):
                # Cells 12 and 22 wrap their text in links; take the matching
                # 'txt_l' cell's text instead of the raw td.
                if index == 12:
                    lrg.append(txt_l[1].text)
                elif index == 22:
                    lrg.append(txt_l[2].text)
                else:
                    # put every other ordinary td on the list as-is
                    lrg.append(td.text)
            # typo fix: s.strip(), not s.trip(); encode because Python 2's
            # csv module writes byte strings, not unicode.
            writer.writerow([s.strip().encode('utf-8') for s in lrg])
# BUG FIX: removed the trailing f.close() — the file is already closed when
# the with block exits, and calling close() again inside the block would
# prevent any rows after the first from being written.
One more note: if f.close() is left inside the with block, the file is closed after the first iteration, so the second and later rows can never be written. Simply remove the f.close()
line — the with statement handles closing for you.
© 2024 OneMinuteCode. All rights reserved.