您的位置:首页 > 其它

爬腾讯视频所有类型的电影

2015-06-28 20:18 260 查看
抓取腾讯视频存入数据库!

#coding: utf-8
import re
import urllib2
from bs4 import BeautifulSoup
import time
import MySQLdb
import sys
reload(sys)
sys.setdefaultencoding('utf8')

NUM = 0          # global: running count of movies scraped so far
m_type = u''     # global: current movie category name (set in getTags/read in getMovie)
m_site = u'qq'   # global: source-site identifier
movieStore = []  # accumulates [NUM, title, url, type] rows for db_insert

# Download the page at the given URL and return its raw body.
def getHtml(url):
    """Fetch *url* with browser-like headers; return the response body bytes."""
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }
    request = urllib2.Request(url, None, request_headers)
    # 30-second timeout keeps a hung server from stalling the whole crawl.
    return urllib2.urlopen(request, None, 30).read()

#从电影分类列表页面获取电影分类
def getTags(html):
global m_type
soup = BeautifulSoup(html)
#return soup
tags_all = soup.find_all('ul', {'class': 'clearfix _group', 'gname': 'mi_type'})
#print len(tags_all), tags_all
#print str(tags_all[0]).replace('\n', '')
#<a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html" title="动作" tvalue="0">动作</a>
reTags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
pattern = re.compile(reTags, re.DOTALL)

tags = pattern.findall(str(tags_all[0]))
if tags:
tagsURL = {}
for tag in tags:
#print tag
tagURL = tag[0].decode('utf-8')
m_type = tag[1].decode('utf-8')
tagsURL[m_type] = tagURL

else:
print "Not Find"
return tagsURL

# Determine how many listing pages one category has.
def getPages(tagUrl):
    """Fetch the first listing page of *tagUrl* and return its page count as an int.

    Returns 1 when the pager holds a single entry (single-page category).
    """
    tag_html = getHtml(tagUrl)
    soup = BeautifulSoup(tag_html)
    # Pager container: <div class="mod_pagenav" id="pager">
    div_page = soup.find_all('div', {'class': 'mod_pagenav', 'id': 'pager'})

    # Pager anchors: <a _hot="movie.page2." ...><span>2</span></a>
    re_pages = r'<a _hot=.+?><span>(.+?)</span></a>'
    p = re.compile(re_pages, re.DOTALL)
    pages = p.findall(str(div_page[0]))
    if len(pages) > 1:
        # Fix: return int in both branches.  The original returned a *string*
        # here and the int 1 below; every caller wraps the result in int(),
        # so this is backward compatible and removes the type inconsistency.
        # The last anchor is "next page"; the one before it is the max page.
        return int(pages[-2])
    return 1

# Parse every movie grid on one listing page.
def getMovieList(html):
    """Locate each <ul class="mod_list_pic_130"> grid in *html* and hand it to getMovie()."""
    page = BeautifulSoup(html)
    for grid in page.find_all('ul', {'class': 'mod_list_pic_130'}):
        # Flatten to one line so the DOTALL regex in getMovie matches cleanly.
        getMovie(str(grid).replace('\n', ''))

def getMovie(html):
    """Extract every movie anchor from *html* and append a
    [NUM, title, url, type] row to the module-global movieStore."""
    global NUM
    global m_type
    global m_site

    reMovie = r'<li><a _hot=\"movie\.image\.link\.1\.\" class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\">.+?</li>'
    matcher = re.compile(reMovie, re.DOTALL)
    # findall yields (href, title) pairs; iterating an empty result is a no-op,
    # so no explicit emptiness check is needed.
    for href, title in matcher.findall(html):
        NUM += 1
        movieStore.append([NUM, title, href, m_type])

#数据库插入数据,自己创建表,字段为:number, title, url, type
def db_insert(insert_list):
try:
conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="meimei1118", db="ctdata", charset='utf8')
cursor = conn.cursor()
cursor.execute('delete from movies')
cursor.execute('alter table movies AUTO_INCREMENT=1')
cursor.executemany("INSERT INTO movies(number,title,url,type) VALUES(%s, %s, %s,%s)", insert_list)
conn.commit()
cursor.close()
conn.close()

except MySQLdb.Error, e:
print "Mysql Error %d: %s" %(e.args[0], e.args[1])

if __name__ == "__main__":
    # Entry point of script 1: scrape everything into movieStore, then
    # bulk-insert at the end.
    url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    html = getHtml(url)
    tagUrls = getTags(html)

    # Each item is a (category_name, category_url) pair.
    for url in tagUrls.items():
        maxPage = int(getPages(str(url[1]).encode('utf-8')))
        print maxPage

        # NOTE(review): only the first 5 pages per category are fetched even
        # though maxPage was just computed -- presumably a deliberate cap for
        # testing; confirm before relying on completeness.
        for x in range(0, 5):
            # Listing URLs differ only in the page-number segment, e.g.
            # http://v.qq.com/list/1_18_-1_-1_1_0_<page>_20_0_-1_0_-1.html
            m_url = str(url[1]).replace('0_20_0_-1_0_-1.html', '')
            movie_url = "%s%d_20_0_-1_0_-1.html" % (m_url, x)
            movie_html = getHtml(movie_url.encode('utf-8'))
            getMovieList(movie_html)
            time.sleep(10)  # throttle requests to the site

    db_insert(movieStore)


数据存入MySQL:



#coding: utf-8
import re
import urllib2
import datetime
import Queue
from bs4 import BeautifulSoup
import time
import MySQLdb
import sys
reload(sys)
sys.setdefaultencoding('utf8')

NUM = 0          # global: running count of movies scraped so far
m_type = u''     # global: current movie category name (set in getTags/read in getMovie)
m_site = u'qq'   # global: source-site identifier
movieStore = []  # unused in this version -- rows go straight to MySQL in getMovie

# Download the page at the given URL and return its raw body.
def getHtml(url):
    # Browser-like headers to avoid trivial bot blocking.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
    timeout = 30  # seconds; keeps a hung server from stalling the crawl
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req, None, timeout)
    return response.read()

#从电影分类列表页面获取电影分类
def getTags(html):
global m_type
soup = BeautifulSoup(html)
#return soup
tags_all = soup.find_all('ul', {'class': 'clearfix _group', 'gname': 'mi_type'})
#print len(tags_all), tags_all
#print str(tags_all[0]).replace('\n', '')
#<a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html" title="动作" tvalue="0">动作</a>
reTags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
pattern = re.compile(reTags, re.DOTALL)

tags = pattern.findall(str(tags_all[0]))
if tags:
tagsURL = {}
for tag in tags:
#print tag
tagURL = tag[0].decode('utf-8')
m_type = tag[1].decode('utf-8')
tagsURL[m_type] = tagURL

else:
print "Not Find"
return tagsURL

# Determine how many listing pages one category has.
def getPages(tagUrl):
    """Fetch the first listing page of *tagUrl* and return its page count as an int.

    Returns 1 when the pager holds a single entry (single-page category).
    """
    tag_html = getHtml(tagUrl)
    soup = BeautifulSoup(tag_html)
    # Pager container: <div class="mod_pagenav" id="pager">
    div_page = soup.find_all('div', {'class': 'mod_pagenav', 'id': 'pager'})

    # Pager anchors: <a _hot="movie.page2." ...><span>2</span></a>
    re_pages = r'<a _hot=.+?><span>(.+?)</span></a>'
    p = re.compile(re_pages, re.DOTALL)
    pages = p.findall(str(div_page[0]))
    if len(pages) > 1:
        # Fix: return int in both branches.  The original returned a *string*
        # here and the int 1 below; every caller wraps the result in int(),
        # so this is backward compatible and removes the type inconsistency.
        return int(pages[-2])
    return 1

# Parse every movie grid on one listing page.
def getMovieList(html):
    soup = BeautifulSoup(html)
    # Each grid is a <ul class="mod_list_pic_130"> block.
    divs = soup.find_all('ul', {'class': 'mod_list_pic_130'})
    for divHtml in divs:
        # Flatten to one line so the DOTALL regex in getMovie matches cleanly.
        divHtml = str(divHtml).replace('\n', '')
        getMovie(divHtml)

def getMovie(html):
    # Extract every movie anchor from *html* and INSERT each row straight
    # into MySQL.
    # NOTE(review): relies on module-level `cursor` and `conn` created in the
    # __main__ block below; calling this before that connection exists raises
    # NameError.  Consider passing the cursor in explicitly.
    global NUM
    global m_type
    global m_site

    reMovie = r'<li><a _hot=\"movie\.image\.link\.1\.\" class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\">.+?</li>'
    p = re.compile(reMovie, re.DOTALL)
    movies = p.findall(html)

    if movies:
        for movie in movies:
            NUM += 1
            # Row layout: number, title, url, timestamp, type.
            eachMovie = [NUM, movie[1], movie[0], datetime.datetime.now(), m_type]
            cursor.execute("INSERT INTO movies(number,title,url,time, type) VALUES(%s, %s, %s,%s,%s)", eachMovie)
            # NOTE(review): committing once per row is slow; batching commits
            # (or using executemany as in script 1) would be much faster.
            conn.commit()

# Bulk insert into MySQL.  Expected table columns: number, title, url, type.
# NOTE(review): unused in this version -- rows are inserted one by one in
# getMovie and the call at the bottom of the file is commented out.  If
# revived, note that this column list omits the `time` column that
# getMovie's INSERT writes; the two must be reconciled.
def db_insert(insert_list):
    try:
        conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="meimei1118", db="ctdata", charset='utf8')
        cursor = conn.cursor()
        # Start from an empty table with a reset auto-increment counter.
        cursor.execute('delete from movies')
        cursor.execute('alter table movies AUTO_INCREMENT=1')
        cursor.executemany("INSERT INTO movies(number,title,url,type) VALUES(%s, %s, %s,%s)", insert_list)
        conn.commit()
        cursor.close()
        conn.close()

    except MySQLdb.Error, e:
        print "Mysql Error %d: %s" %(e.args[0], e.args[1])

if __name__ == "__main__":
    # Entry point of script 2: open one DB connection up front, then insert
    # each movie as it is scraped (see getMovie).
    url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    html = getHtml(url)
    tagUrls = getTags(html)
    try:
        # conn/cursor become module globals read by getMovie().
        conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="meimei1118", db="ctdata", charset='utf8')
        cursor = conn.cursor()
        cursor.execute('delete from movies')  # start from an empty table
        # Each item is a (category_name, category_url) pair.
        for url in tagUrls.items():
            maxPage = int(getPages(str(url[1]).encode('utf-8')))
            print maxPage

            # Unlike script 1, this version walks ALL pages of the category.
            for x in range(0, maxPage):
                # Listing URLs differ only in the page-number segment.
                m_url = str(url[1]).replace('0_20_0_-1_0_-1.html', '')
                movie_url = "%s%d_20_0_-1_0_-1.html" % (m_url, x)
                movie_html = getHtml(movie_url.encode('utf-8'))
                getMovieList(movie_html)
                time.sleep(10)  # throttle requests to the site

            print u"完成..."

        # NOTE(review): cursor/conn leak if an exception fires above; a
        # finally block would be safer.
        cursor.close()
        conn.close()
    except MySQLdb.Error, e:
        print "Mysql Error %d: %s" %(e.args[0], e.args[1])


将上述程序根据面向对象的思想进行改写,并对电影列表获取部分进行多线程的改写。

代码主要分成两个部分,一个是方法类methods,一个是面向对象的改写

methods.py

#coding: utf-8
import re
import urllib2
import datetime
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf8')

NUM = 0          # global: movie counter (increment is disabled in this version)
m_type = u''     # global: current movie category name (set in getTags/read in getMovie)
m_site = u'qq'   # global: source-site identifier
movieStore = []  # accumulates [title, url, type, timestamp] rows; shared with movies.py

# Download the page at the given URL and return its raw body.
def getHtml(url):
    """Fetch *url* with browser-like headers; return the response body bytes."""
    request = urllib2.Request(url, None, {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'})
    # 20-second timeout keeps one stuck request from blocking a worker thread.
    return urllib2.urlopen(request, None, 20).read()

#从电影分类列表页面获取电影分类
def getTags(html):
global m_type
soup = BeautifulSoup(html)
#return soup
tags_all = soup.find_all('ul', {'class': 'clearfix _group', 'gname': 'mi_type'})
#print len(tags_all), tags_all
#print str(tags_all[0]).replace('\n', '')
#<a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html" title="动作" tvalue="0">动作</a>
reTags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
pattern = re.compile(reTags, re.DOTALL)

tags = pattern.findall(str(tags_all[0]))
if tags:
tagsURL = {}
for tag in tags:
#print tag
tagURL = tag[0].decode('utf-8')
m_type = tag[1].decode('utf-8')
tagsURL[m_type] = tagURL
print m_type

else:
print "Not Find"
return tagsURL

# Determine how many listing pages one category has.
def getPages(tagUrl):
    """Fetch the first listing page of *tagUrl* and return its page count as an int.

    Returns 1 when the pager holds a single entry (single-page category).
    """
    tag_html = getHtml(tagUrl)
    soup = BeautifulSoup(tag_html)
    # Pager container: <div class="mod_pagenav" id="pager">
    div_page = soup.find_all('div', {'class': 'mod_pagenav', 'id': 'pager'})

    # Pager anchors: <a _hot="movie.page2." ...><span>2</span></a>
    re_pages = r'<a _hot=.+?><span>(.+?)</span></a>'
    p = re.compile(re_pages, re.DOTALL)
    pages = p.findall(str(div_page[0]))
    if len(pages) > 1:
        # Fix: return int in both branches.  The original returned a *string*
        # here and the int 1 below; every caller wraps the result in int(),
        # so this is backward compatible and removes the type inconsistency.
        return int(pages[-2])
    return 1
# Extract the movie entries from one grid block.
def getMovie(html):
    # Appends a [title, url, type, timestamp] row to the module-global
    # movieStore for every movie anchor found in *html*.
    # NOTE(review): when driven by the worker threads in movies.py, the
    # module-global m_type still holds whatever getTags() set last -- the
    # per-thread category is never propagated here, so every row appears to
    # get the same type label.  Confirm and consider passing the type in.
    # movieStore.append itself is safe under CPython's GIL.
    global NUM
    global m_type
    global m_site

    reMovie = r'<li><a _hot=\"movie\.image\.link\.1\.\" class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\">.+?</li>'
    p = re.compile(reMovie, re.DOTALL)
    movies = p.findall(html)
    if movies:
        for movie in movies:
            # Row layout: title, url, type, timestamp.
            eachMovie = [movie[1], movie[0], m_type, datetime.datetime.now()]
            print eachMovie
            movieStore.append(eachMovie)

# Collect the movies from a single listing page.
def getMovieList(html):
    """Locate every <ul class="mod_list_pic_130"> grid in *html* and parse each."""
    page = BeautifulSoup(html)
    grids = page.find_all('ul', {'class': 'mod_list_pic_130'})
    for grid in grids:
        # Flatten to one line so the DOTALL regex in getMovie matches cleanly.
        flattened = str(grid).replace('\n', '')
        getMovie(flattened)


movies.py

#coding: utf-8
import re
import urllib2
import datetime
import threading
from methods import *
import time
import MySQLdb
import sys

reload(sys)
sys.setdefaultencoding('utf8')

NUM = 0             # global: movie counter (unused in this driver script)
movieTypeList = []  # one [type_name, listing_url, max_page] entry per category

def getMovieType():
    # Discover all categories and their page counts; fills the module-global
    # movieTypeList with [type_name, listing_url, max_page] entries.
    url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    html = getHtml(url)
    tagUrls = getTags(html)
    # Each item is a (category_name, category_url) pair.
    for url in tagUrls.items():
        maxPage = int(getPages(str(url[1]).encode('utf-8')))
        typeList = [url[0], str(url[1]).encode('utf-8'), maxPage]
        movieTypeList.append(typeList)

# Worker thread: scrapes every listing page of one movie category.
class GetMovies(threading.Thread):
    def __init__(self, movie):
        # *movie* is one [type_name, listing_url, max_page] entry from
        # getMovieType().
        threading.Thread.__init__(self)
        self.movie = movie

    def getMovies(self):
        # Fetch and parse every listing page for this category.
        maxPage = int(self.movie[2])
        # NOTE(review): m_type here is a local used only for the final print;
        # it is never written back to methods.m_type, so the rows collected
        # by methods.getMovie keep whatever category getTags() set last --
        # confirm whether that is intended.
        m_type = self.movie[0]
        for x in range(0, maxPage):
            # Listing URLs differ only in the page-number segment, e.g.
            # http://v.qq.com/list/1_18_-1_-1_1_0_<page>_20_0_-1_0_-1.html
            m_url = str(self.movie[1]).replace('0_20_0_-1_0_-1.html', '')
            movie_url = "%s%d_20_0_-1_0_-1.html" % (m_url, x)
            movie_html = getHtml(movie_url.encode('utf-8'))
            getMovieList(movie_html)
            time.sleep(5)  # throttle requests to the site

        print m_type + u"完成..."

    def run(self):
        # Thread entry point.
        self.getMovies()

#插入数据库,表结构自己创建
def db_insert(insert_list):
try:
conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="meimei1118", db="ctdata", charset='utf8')
cursor = conn.cursor()
cursor.execute('delete from test')
cursor.execute('alter table test AUTO_INCREMENT=1')
cursor.executemany("INSERT INTO test(title,url,type,time) VALUES (%s,%s,%s,%s)", insert_list)
conn.commit()
cursor.close()
conn.close()
except MySQLdb.Error, e:
print "Mysql Error %d: %s" % (e.args[0], e.args[1])

if __name__ == "__main__":
    # Driver: discover categories sequentially, then scrape each category in
    # its own thread, finally bulk-insert everything collected in
    # methods.movieStore.
    getThreads = []
    getMovieType()
    start = time.time()
    # One worker thread per category.
    for i in range(len(movieTypeList)):
        t = GetMovies(movieTypeList[i])
        getThreads.append(t)

    for i in range(len(getThreads)):
        getThreads[i].start()

    for i in range(len(getThreads)):
        getThreads[i].join()

    t1 = time.time() - start
    print t1  # total scrape duration, seconds
    t = time.time()
    db_insert(movieStore)  # movieStore was filled by the threads via methods.getMovie
    t2 = time.time() - t
    print t2  # insert duration, seconds
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: