数据保存!!!Python 爬取网页数据后,三种保存格式---保存为txt文件、CSV文件和mysql数据库
2017-09-07 11:55
1311 查看
Python爬取网站数据后,数据的保存方式是大家比较关心的一件事情,也是接下来能否更简便地处理数据的关键步骤。下面,就Python爬取网页数据后的保存格式进行简单介绍。三种保存格式为txt格式、CSV格式和数据库格式。
首先,保存为txt格式。话不多说,直接上代码!
# -*- coding: utf-8 -*-
import requests
import json
import html
import urllib
import sys
import re
import random
import time
from threading import Timer
from bs4 import BeautifulSoup
# Python 2-only hack: re-expose sys.setdefaultencoding (hidden by site.py)
# so implicit str<->unicode conversions use UTF-8 instead of ASCII.
# NOTE(review): discouraged even on Python 2, and does not exist on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
# Desktop-browser User-Agent so the site serves the normal JSON response.
headers ={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'}
def get_html1(i):
    """Fetch page *i* of the fixed-term product listing; return the raw response body."""
    base = 'https://www.ppmoney.com/StepUp/List/-1/{}/fixedterm/true/false?_={}'
    # Random timestamp-like value used as a cache-busting query parameter.
    cache_buster = random.randint(1501050773102, 1501051774102)
    response = requests.get(base.format(i, cache_buster), headers=headers)
    return response.content
def get_data1(html):
    """Parse the JSON product listing in *html* and append
    name / profit / investedMoney rows to a tab-separated text file.

    *html* is the raw JSON document returned by get_html1().
    """
    data1 = json.loads(html)
    data = data1['PackageList']['Data']
    for item in data:
        # product name, interest rate, invested amount
        line = '%s\t%s\t%s' % (item['name'], item['profit'], item['investedMoney'])
        # Single-string print works on both Python 2 and 3 (the original
        # comma-separated py2 print statement is a SyntaxError on py3).
        print(line)
        with open('d:PPmonenyshengxinbao9.6.txt', 'a') as f:
            # BUG FIX: the original f.write(...) was missing its closing
            # parenthesis, which made the whole script a SyntaxError.
            f.write(line + '\n')
# Crawl pages 1-9 and persist each page's rows to the text file.
for page in range(1, 10):
    get_data1(get_html1(page))
执行代码后,生成文件打开后显示如下:
2.保存为CSV格式。
# -*- coding: utf-8 -*-
import requests
import pandas as pd
import numpy as np
import json
import html
import urllib
import sys
import re
import random
import time
from threading import Timer
from bs4 import BeautifulSoup
# Python 2-only hack: re-expose sys.setdefaultencoding (hidden by site.py)
# so implicit str<->unicode conversions use UTF-8 instead of ASCII.
# NOTE(review): discouraged even on Python 2, and does not exist on Python 3.
reload(sys)
sys.setdefaultencoding('utf8')
# Desktop-browser User-Agent so the site serves the normal JSON response.
headers ={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'}
def get_html1(i):
    """Fetch page *i* of the product listing and return the parsed 'Data' record list."""
    base = 'https://www.ppmoney.com/StepUp/List/-1/{}/fixedterm/true/false?_={}'
    # Random timestamp-like value used as a cache-busting query parameter.
    response = requests.get(base.format(i, random.randint(1501050773102, 1501051774102)),
                            headers=headers)
    payload = json.loads(response.content)
    return payload['PackageList']['Data']
# Collect 100 pages of records, tag each row with its page index, dump to CSV.
data_ceshi = pd.DataFrame([])
html_list = []
for i in range(100):
    html_list.append(get_html1(i))
frames = []
for i, heml_avg in enumerate(html_list):
    tmp = pd.DataFrame(heml_avg)
    tmp["page_id"] = i
    frames.append(tmp)
# FIX: DataFrame.append was deprecated and removed in pandas 2.0; collecting
# the frames and calling pd.concat once is the supported (and linear-time)
# way to assemble them. print(...) also works on both Python 2 and 3.
if frames:
    data_ceshi = pd.concat(frames)
print(data_ceshi)
data_ceshi.to_csv('e:/data.csv', encoding='gbk')
保存后,结果如下:
3.保存到数据库。
# -*- coding: utf-8 -*-
import requests
import pandas as pd
import numpy as np
import json
import html
import urllib
import sys
import re
import random
import MySQLdb
import time
from threading import Timer
from bs4 import BeautifulSoup
# Python 2-only hack: re-expose sys.setdefaultencoding (hidden by site.py)
# so implicit str<->unicode conversions use UTF-8 instead of ASCII.
# NOTE(review): discouraged even on Python 2, and does not exist on Python 3.
reload(sys)
sys.setdefaultencoding('utf8')
# Desktop-browser User-Agent so the site serves the normal JSON response.
headers ={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'}
# Connect to the local MySQL database.
# NOTE(review): credentials are hard-coded in source; move them to a config
# file or environment variables before sharing/deploying this script.
db = MySQLdb.connect(host = 'localhost',port = 3306,user = 'zhouliye',passwd = '123456zz',db = 'abbc',charset='utf8')
print '连接上数据库了!'
cursor = db.cursor()
# Recreate the target table from scratch on every run (drops previous data).
cursor.execute("DROP TABLE IF EXISTS shengxb")
# SQL keywords are case-insensitive, so the odd "CREAtE" casing still works.
sql = """CREAtE TABLE SHENGXB(
beginTime DATETIME,
endTime DATETIME,
investedMoney float,
name CHAR(50))"""
cursor.execute(sql)
def get_html1(i):
    """Fetch page *i* of the product listing and return the parsed 'Data' record list."""
    base = 'https://www.ppmoney.com/StepUp/List/-1/{}/fixedterm/true/false?_={}'
    # Random timestamp-like value used as a cache-busting query parameter.
    response = requests.get(base.format(i, random.randint(1501050773102, 1501051774102)),
                            headers=headers)
    payload = json.loads(response.content)
    return payload['PackageList']['Data']
data_ceshi=pd.DataFrame([])#建立一个空数据框
html_list =[]#建立一个空列表
for i in range(10):
html_list.append(get_html1(i))
for i in html_list:
for j in i:
a = j['beginTime']
b = j['endTime']
c = j['investedMoney']
d = j['name']
print u'开始时间: ' + str(a) + u'结束时间: '+ str(b) + u'投资金额: ' + str(c) + u'项目名称' + str(d)
insert_SHENGXB = ("INSERT INTO SHENGXB (beginTime,endTime,investedMoney,name) VALUES(%s,%s,%s,%s)")
data_data= (a, b, c, d)
cursor.execute(insert_SHENGXB,data_data)
db.commit()
print '******完成此条插入!'
print '爬取数据并插入mysql数据库完成...'
保存后显示如下:
首先,保存为txt格式。话不多说,直接上代码!
# -*- coding: utf-8 -*-
import requests
import json
import html
import urllib
import sys
import re
import random
import time
from threading import Timer
from bs4 import BeautifulSoup
# Python 2-only hack: re-expose sys.setdefaultencoding (hidden by site.py)
# so implicit str<->unicode conversions use UTF-8 instead of ASCII.
# NOTE(review): discouraged even on Python 2, and does not exist on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
# Desktop-browser User-Agent so the site serves the normal JSON response.
headers ={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'}
def get_html1(i):
    """Fetch page *i* of the fixed-term product listing; return the raw response body."""
    base = 'https://www.ppmoney.com/StepUp/List/-1/{}/fixedterm/true/false?_={}'
    # Random timestamp-like value used as a cache-busting query parameter.
    cache_buster = random.randint(1501050773102, 1501051774102)
    response = requests.get(base.format(i, cache_buster), headers=headers)
    return response.content
def get_data1(html):
    """Parse the JSON product listing in *html* and append
    name / profit / investedMoney rows to a tab-separated text file.

    *html* is the raw JSON document returned by get_html1().
    """
    data1 = json.loads(html)
    data = data1['PackageList']['Data']
    for item in data:
        # product name, interest rate, invested amount
        line = '%s\t%s\t%s' % (item['name'], item['profit'], item['investedMoney'])
        # Single-string print works on both Python 2 and 3 (the original
        # comma-separated py2 print statement is a SyntaxError on py3).
        print(line)
        with open('d:PPmonenyshengxinbao9.6.txt', 'a') as f:
            # BUG FIX: the original f.write(...) was missing its closing
            # parenthesis, which made the whole script a SyntaxError.
            f.write(line + '\n')
# Crawl pages 1-9 and persist each page's rows to the text file.
for page in range(1, 10):
    get_data1(get_html1(page))
执行代码后,生成文件打开后显示如下:
2.保存为CSV格式。
# -*- coding: utf-8 -*-
import requests
import pandas as pd
import numpy as np
import json
import html
import urllib
import sys
import re
import random
import time
from threading import Timer
from bs4 import BeautifulSoup
# Python 2-only hack: re-expose sys.setdefaultencoding (hidden by site.py)
# so implicit str<->unicode conversions use UTF-8 instead of ASCII.
# NOTE(review): discouraged even on Python 2, and does not exist on Python 3.
reload(sys)
sys.setdefaultencoding('utf8')
# Desktop-browser User-Agent so the site serves the normal JSON response.
headers ={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'}
def get_html1(i):
    """Fetch page *i* of the product listing and return the parsed 'Data' record list."""
    base = 'https://www.ppmoney.com/StepUp/List/-1/{}/fixedterm/true/false?_={}'
    # Random timestamp-like value used as a cache-busting query parameter.
    response = requests.get(base.format(i, random.randint(1501050773102, 1501051774102)),
                            headers=headers)
    payload = json.loads(response.content)
    return payload['PackageList']['Data']
# Collect 100 pages of records, tag each row with its page index, dump to CSV.
data_ceshi = pd.DataFrame([])
html_list = []
for i in range(100):
    html_list.append(get_html1(i))
frames = []
for i, heml_avg in enumerate(html_list):
    tmp = pd.DataFrame(heml_avg)
    tmp["page_id"] = i
    frames.append(tmp)
# FIX: DataFrame.append was deprecated and removed in pandas 2.0; collecting
# the frames and calling pd.concat once is the supported (and linear-time)
# way to assemble them. print(...) also works on both Python 2 and 3.
if frames:
    data_ceshi = pd.concat(frames)
print(data_ceshi)
data_ceshi.to_csv('e:/data.csv', encoding='gbk')
保存后,结果如下:
3.保存到数据库。
# -*- coding: utf-8 -*-
import requests
import pandas as pd
import numpy as np
import json
import html
import urllib
import sys
import re
import random
import MySQLdb
import time
from threading import Timer
from bs4 import BeautifulSoup
# Python 2-only hack: re-expose sys.setdefaultencoding (hidden by site.py)
# so implicit str<->unicode conversions use UTF-8 instead of ASCII.
# NOTE(review): discouraged even on Python 2, and does not exist on Python 3.
reload(sys)
sys.setdefaultencoding('utf8')
# Desktop-browser User-Agent so the site serves the normal JSON response.
headers ={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'}
# Connect to the local MySQL database.
# NOTE(review): credentials are hard-coded in source; move them to a config
# file or environment variables before sharing/deploying this script.
db = MySQLdb.connect(host = 'localhost',port = 3306,user = 'zhouliye',passwd = '123456zz',db = 'abbc',charset='utf8')
print '连接上数据库了!'
cursor = db.cursor()
# Recreate the target table from scratch on every run (drops previous data).
cursor.execute("DROP TABLE IF EXISTS shengxb")
# SQL keywords are case-insensitive, so the odd "CREAtE" casing still works.
sql = """CREAtE TABLE SHENGXB(
beginTime DATETIME,
endTime DATETIME,
investedMoney float,
name CHAR(50))"""
cursor.execute(sql)
def get_html1(i):
    """Fetch page *i* of the product listing and return the parsed 'Data' record list."""
    base = 'https://www.ppmoney.com/StepUp/List/-1/{}/fixedterm/true/false?_={}'
    # Random timestamp-like value used as a cache-busting query parameter.
    response = requests.get(base.format(i, random.randint(1501050773102, 1501051774102)),
                            headers=headers)
    payload = json.loads(response.content)
    return payload['PackageList']['Data']
data_ceshi=pd.DataFrame([])#建立一个空数据框
html_list =[]#建立一个空列表
for i in range(10):
html_list.append(get_html1(i))
for i in html_list:
for j in i:
a = j['beginTime']
b = j['endTime']
c = j['investedMoney']
d = j['name']
print u'开始时间: ' + str(a) + u'结束时间: '+ str(b) + u'投资金额: ' + str(c) + u'项目名称' + str(d)
insert_SHENGXB = ("INSERT INTO SHENGXB (beginTime,endTime,investedMoney,name) VALUES(%s,%s,%s,%s)")
data_data= (a, b, c, d)
cursor.execute(insert_SHENGXB,data_data)
db.commit()
print '******完成此条插入!'
print '爬取数据并插入mysql数据库完成...'
保存后显示如下:
相关文章推荐
- Python数据处理-将数据保存为txt、csv等文件格式方法
- Python脚本---把MySQL数据库表中的数据导出生成csv格式文件
- Python实现将MySQL数据库表中的数据导出生成csv格式文件的方法
- python通过串口读取GPS NMEA格式的数据,并保存为csv文件
- 利用Python爬取妙笔阁小说网站的小说信息并保存为txt和csv格式
- Python3将数据保存为txt文件
- python读取csv大数据文件到mysql数据库中(ubunu14.04下)
- python---爬取中国彩票网的双色球数据,保存txt与xls格式。object has no attribute 'pipelines'
- python处理CSV文件格式数据
- 利用python抓取搜狗关于数据分析的文章并保存到csv文件
- 用Python将gml文件中边的信息输出为csv(或者txt)格式
- [置顶] [原创]自己动手写CSDN博客提取器,提取文件保存支持PDF、doc、txt三种格式
- python3将csv文件中的两列数据读取出来,并且按行写入txt文本之中
- 自己动手写CSDN博客提取器,提取文件保存支持PDF、doc、txt三种格式
- 几行Python代码生成饭店营业额模拟数据并保存为CSV文件
- txt或者csv数据文件的格式是有要求的,如下shell代码中说明。
- matlab中将数据输出保存为txt格式文件的方法
- python用read_csv导入txt文件时的数据丢失问题
- C编写以二进制读取文件(任意格式eg :图片),保存到C语言数据格式的TXT文档
- 自己动手写CSDN博客提取器,提取文件保存支持PDF、doc、txt三种格式