python爬虫实战(四)
2017-09-08 16:54
337 查看
#!/user/bin/python # -*- coding: UTF-8 -*- import urllib import urllib2 import lxml import re import MySQLdb import time from bs4 import BeautifulSoup import httplib httplib.HTTPConnection._http_vsn = 10 httplib.HTTPConnection._http_vsn_str = 'HTTP/1.1' user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36' hdr = { 'User-Agent' : user_agent } db = MySQLdb.connect(host="localhost", port=3306, user="root", passwd="123456", db="xiaoshuo", charset="utf8") str_sql = '''INSERT INTO `xiaoshuo`.`book1` (`bookName`, `author`, `url`, `classifyName`, `brief`, `updateTime`, `status`) VALUES''' def getBookInfoBaseOnUrl(url): global str_sql request = urllib2.Request(url, headers=hdr) response = urllib2.urlopen(request) html_data = response.read().decode('gbk') soup = BeautifulSoup(html_data,'lxml') mylist = soup.select('head') for item in mylist: bookName = item.find(property="og:novel:book_name").get("content") print "书名:", bookName author = item.find(property="og:novel:author").get("content") print "作者:", author url = item.find(property="og:novel:read_url").get("content") print "链接:", url classifyName = item.find(property="og:novel:category").get("content") print "类型:", classifyName description = item.find(property="og:description").get("content") print "brief:", description updateTime = item.find(property="og:novel:update_time").get("content") print "更新时间:", updateTime status = item.find(property="og:novel:status").get("content") print "status:", status str_sql += '("' + bookName + '", "' + author + '", "' + url + '", "' + classifyName + '", "' + description + '", "' + updateTime + '", "' + status + '"),' print "-----------------------------------------------------------------------------------------" def get_book( ): global str_sql cursor = db.cursor() #count = 0 soup = BeautifulSoup(open('biquge.html'),'lxml') mylist = soup.find_all('div', class_ ='content') for item in mylist: #print item xiaoshuo_list = item.find_all('li') for item in xiaoshuo_list: #coutn2 = 0 url = item.find('a').get('href') print "书的连接:" , url getBookInfoBaseOnUrl(url) #coutn2 = coutn2 + 1 #if coutn2 == 2: #break #count = count + 1 #if count == 2: #break for item in mylist: xiaoshuo_list = item.find_all('dl') for item in xiaoshuo_list: url = item.find('a').get('href') print "书的连接:", url getBookInfoBaseOnUrl(url) str_sql = str_sql.encode("utf-8") str_sql = str_sql[0:len(str_sql)-1] print "tmp_sql:", str_sql cursor.execute(str_sql) db.commit() cursor.close() if __name__ == "__main__": print ("<<<-----Start Get Book INFO And Save Db------>>") get_book() db.close() print str_sql
相关文章推荐
- 爬虫实战之一--python3及pip安装
- Python 爬虫实战案例 : 煎蛋网全站图片爬虫
- Python爬虫实战——模拟登录教务系统
- python3[爬虫实战] 爬虫之requests爬取新浪微博京东客服
- 0基础Python实战:爬虫计划---第一课
- Python3 爬虫实战(二)——图片爬虫
- Python爬虫框架Scrapy实战之批量抓取招聘信息
- 运维学python之爬虫基础篇实战(六)爬取百度贴吧
- Python爬虫实战学习地址
- Python爬虫实战案例:爬取爱奇艺VIP视频
- Python爬虫框架Scrapy 学习笔记 10.1 -------【实战】 抓取天猫某网店所有宝贝详情
- Python爬虫框架Scrapy 学习笔记 10.3 -------【实战】 抓取天猫某网店所有宝贝详情
- Python3爬虫实战(requests模块)
- Python爬虫学习,实战一糗事百科(2017/7/21更新)
- python 爬虫实战1 爬取糗事百科段子
- Python爬虫入门实战八:数据储存——MongoDB与MySQL
- python爬虫实战 爬取天极图片
- python3 [入门基础实战] 爬虫之四季花果园的采果模块
- 【含代码】Python爬虫实战:爬取全站小说排行榜