您的位置:首页 > 编程语言

《鲜活的数据-第2章 处理数据》有关代码

2017-09-11 19:02 113 查看

2.1.3 自动收集数据

import urllib2

from BeautifulSoup import BeautifulSoup


# Download one daily-history page (URL path: airport/ZHCC/2017/9/8) and
# parse it so the markup can be explored.
page = urllib2.urlopen("https://www.wunderground.com/history/airport/ZHCC/2017/9/8/DailyHistory.html")
try:
    soup = BeautifulSoup(page)
finally:
    # The parser has already read the response; release the connection.
    page.close()

# Look at the first <img> tag on the page.
images = soup.findAll('img')
first_image = images[0]
print(first_image)

# Collect every element that carries class="wx-value".
wxvalue = soup.findAll(attrs={"class": "wx-value"})
print(wxvalue)
print(wxvalue[0])

# The first match has no nested <span>, so `.span` is None and an
# unconditional `wxvalue[0].span.string` raises
# "AttributeError: 'NoneType' object has no attribute 'string'".
# Only dereference the <span> when it actually exists.
first_value = wxvalue[0]
if first_value.span is not None:
    print(first_value.span.string)

# The value is the element's first child, which is what gets read here.
print(first_value.contents[0].string)


import calendar
import sys

# Year and station appear in both the URL and the timestamp; keep each in
# one place so they cannot drift apart.
YEAR = 2016
STATION = "ZHCC"

for m in range(1, 13):
    # calendar.monthrange returns (weekday of the 1st, number of days), so
    # month lengths and leap years are handled: February 2016 has 29 days.
    days_in_month = calendar.monthrange(YEAR, m)[1]

    for d in range(1, days_in_month + 1):

        # Build a zero-padded YYYYMMDD timestamp, e.g. 20160105.
        timestamp = '%d%02d%02d' % (YEAR, m, d)

        # Progress goes to stderr so that stdout carries only CSV rows and
        # can be redirected straight into a data file.
        sys.stderr.write("Getting data for " + timestamp + "\n")

        # Open wunderground.com url
        url = ("https://www.wunderground.com/history/airport/" + STATION + "/"
               + str(YEAR) + "/" + str(m) + "/" + str(d) + "/DailyHistory.html")
        page = urllib2.urlopen(url)

        # Get temperature from page: the first class="wx-value" element's
        # first child holds the value that is recorded.
        try:
            soup = BeautifulSoup(page)
        finally:
            # Release the connection once the page has been parsed.
            page.close()
        dayTemp = soup.findAll(attrs={"class": "wx-value"})[0].contents[0].string

        # Emit one "timestamp,temperature" row; print() already appends the
        # newline, so none is added by hand (that produced blank lines).
        print(timestamp + ',' + dayTemp)


终端输入并运行文件

python get-weather-data.py

2.2.3 用代码来格式化

1. CSV转为XML

import csv


# csv2xml: echo every row of wunder-data.txt to standard output as an
# <observation> element (column 0 -> <date>, column 1 -> <temperature>),
# wrapped in a single <weather_data> root element.
reader = csv.reader(open('wunder-data.txt', 'r'), delimiter=",")

print('<weather_data>')

for record in reader:
    print('<observation>')
    print('<date>%s</date>' % record[0])
    print('<temperature>%s</temperature>' % record[1])
    print('</observation>')

print('</weather_data>')


终端输入并运行文件

python csv2xml.py >wunder-data1.xml

import csv

# Variant of the CSV-to-XML conversion that writes wunder-data.xml directly
# instead of printing to standard output.
#
# A fresh reader is created here: csv.reader is a one-shot iterator, so a
# reader that an earlier loop has already consumed would yield no rows and
# the output would contain no observations.
with open('wunder-data.txt', 'r') as csv_file:
    reader = csv.reader(csv_file, delimiter=",")

    # `with` closes the output file even if a row is malformed.
    with open('wunder-data.xml', 'w') as f:
        f.write('<weather_data>')

        for row in reader:
            f.write('<observation>')
            f.write('<date>' + row[0] + '</date>')
            f.write('<temperature>' + row[1] + '</temperature>')
            f.write('</observation>')

        f.write('</weather_data>')


2. XML转为CSV

from BeautifulSoup import BeautifulStoneSoup


# xml2csv: read wunder-data.xml and print one "date,temperature" line per
# <observation> element.
with open('wunder-data.xml', 'r') as f:
    # Read the whole document, then let `with` close the file right away.
    xml = f.read()

soup = BeautifulStoneSoup(xml)
observations = soup.findAll('observation')
for o in observations:
    print(o.date.string + "," + o.temperature.string)


终端输入并运行文件

python xml2csv.py >wunder-data1.txt

3. CSV转为JSON

import csv


def format_observations(rows):
    """Return the JSON document text for an iterable of CSV rows.

    Each row supplies the date in column 0 (emitted as a JSON string) and
    the temperature in column 1 (emitted verbatim, i.e. as a JSON number
    when the column holds digits).  Blank rows are skipped.

    The separator after each object is chosen from the row's position, not
    from a fixed row count, so the result is well-formed for any number of
    rows (a hard-coded 365 breaks on leap years and partial files).
    """
    # Materialise the rows so the last one can be identified; drop the
    # empty lists csv.reader yields for blank lines.
    rows = [row for row in rows if row]
    last_index = len(rows) - 1

    lines = ['{ "observations": [']
    for index, row in enumerate(rows):
        lines.append('{')
        lines.append('"date": ' + '"' + row[0] + '", ')
        lines.append('"temperature": ' + row[1])
        # JSON forbids a trailing comma after the final array element.
        lines.append(' }' if index == last_index else ' },')
    lines.append('] }')
    return "\n".join(lines)


if __name__ == "__main__":
    # Script entry point: convert wunder-data.txt and print the JSON.
    with open('wunder-data.txt', 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter=",")
        print(format_observations(reader))


终端输入并运行文件

python csv2json.py >wunder-data1.json

4. 在循环中加入新的逻辑

import csv

# Temperatures at or below this value are flagged as freezing.
# NOTE(review): presumably degrees Fahrenheit - confirm the unit used in
# wunder-data.txt.
FREEZING_POINT = 32

# Append a third column to every row: '1' when the temperature is at or
# below the freezing point, otherwise '0'.
with open('wunder-data.txt', 'r') as csv_file:
    reader = csv.reader(csv_file, delimiter=",")

    for row in reader:
        is_freezing = '1' if int(row[1]) <= FREEZING_POINT else '0'
        print(row[0] + "," + row[1] + "," + is_freezing)


终端输入并运行文件

python freezingInfo.py >wunder-data-fz.txt


                                            
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  数据可视化