【Python编程】网页URL提取实例
2014-02-25 18:29
633 查看
import urllib.request
import re
def ssubcatagory(urllink, j):
    """Fetch one second-level category page and print its third-level
    category names plus the (url, title) pairs of their links.

    urllink: absolute URL of the second-level category page.
    j: 1-based ordinal of this sub-category; pages past the 10th are skipped.
    """
    with urllib.request.urlopen(urllink) as fp:
        filecontent = fp.read()
    # Sniff the charset by searching the head of the raw payload, as the
    # original did; fall back to GB2312 (common on older Chinese sites).
    head = str(filecontent)[:1000]
    if "UTF-8" in head or "utf-8" in head:
        mystr = filecontent.decode('UTF-8', errors='replace')
    elif "gbk" in head or "GBK" in head:
        mystr = filecontent.decode('GBK', errors='replace')
    else:
        mystr = filecontent.decode('GB2312', errors='replace')
    if j <= 10:
        # BUG FIX: the original pattern contained the invalid escape \k
        # ("\kag"), which raises re.error on Python 3.7+.  Match the literal
        # attribute text instead.
        # NOTE(review): class name "kag sclearfix" kept verbatim from the
        # original pattern -- confirm against the live page markup.
        ssubcata = re.findall(r'<div class="kag sclearfix">(.*?)</div></div></div>', mystr)
        for block in ssubcata:
            sssubcata = re.findall(r'blank">(.*?)</a>', block)
            # Guard: the original indexed [0] unconditionally and would
            # raise IndexError on a block with no match.
            if sssubcata:
                print("三级:", sssubcata[0])
            urladdress = re.findall(r'href="http://(.*?)" class="kaj">(.*?)</a>', block)
            for url, title in urladdress:
                print(url, title)
def subcatagory(mystr, j):
    """Print the second-level categories under the "生活服务" (life
    services) section of the sitemap and recurse into each of them.

    mystr: decoded HTML of the whole sitemap page.
    j: section ordinal; only section 1 ("生活服务") is handled.
    """
    if j != 1:
        return
    subcata = re.findall(r'生活服务</b></h3><ul class="list clearfix">(.*?)</ul></div>', mystr)
    print("一级:生活服务--")
    # BUG FIX: the original searched str(subcata) -- the *repr* of the
    # result list -- which adds brackets/quotes and escapes backslashes and
    # can corrupt matches; join the captured fragments instead.
    # Example target: href="http://gouwu.hao123.com/" class="link"><b>购物</b>
    ssubcata = re.findall(r'href="(.*?)" class="link"><b>(.*?)</b>', ''.join(subcata))
    for k, (link, title) in enumerate(ssubcata, start=1):
        print("二级:", link, title)
        # gouwu, caipiao and so on
        ssubcatagory(link, k)
def mainfun():
    """Download the hao123 sitemap, save a decoded copy to results.txt,
    and walk every <div class="section"> category found on the page."""
    url = 'http://www.hao123.com/sitemap'
    with urllib.request.urlopen(url) as fp:
        content = fp.read()
    mystr = content.decode('GBK')
    # BUG FIX: the original wrote `file.close` (missing parentheses), so the
    # handle was never explicitly closed; a context manager closes it
    # deterministically.  An explicit encoding avoids locale-dependent
    # failures when writing the Chinese text.
    with open('results.txt', 'w', encoding='utf-8') as out:
        out.write(mystr)
    # Example target: <div class="section" id="生活服务">
    catapattern = re.findall(r'<div class="section" id="(.*?)">', mystr)
    for j, _section in enumerate(catapattern, start=1):
        subcatagory(mystr, j)
# Run the scrape only when executed as a script, not on import.
if __name__ == "__main__":
    mainfun()
import re
def ssubcatagory(urllink, j):
    """Fetch one second-level category page and print its third-level
    category names plus the (url, title) pairs of their links.

    urllink: absolute URL of the second-level category page.
    j: 1-based ordinal of this sub-category; pages past the 10th are skipped.
    """
    with urllib.request.urlopen(urllink) as fp:
        filecontent = fp.read()
    # Sniff the charset by searching the head of the raw payload, as the
    # original did; fall back to GB2312 (common on older Chinese sites).
    head = str(filecontent)[:1000]
    if "UTF-8" in head or "utf-8" in head:
        mystr = filecontent.decode('UTF-8', errors='replace')
    elif "gbk" in head or "GBK" in head:
        mystr = filecontent.decode('GBK', errors='replace')
    else:
        mystr = filecontent.decode('GB2312', errors='replace')
    if j <= 10:
        # BUG FIX: the original pattern contained the invalid escape \k
        # ("\kag"), which raises re.error on Python 3.7+.  Match the literal
        # attribute text instead.
        # NOTE(review): class name "kag sclearfix" kept verbatim from the
        # original pattern -- confirm against the live page markup.
        ssubcata = re.findall(r'<div class="kag sclearfix">(.*?)</div></div></div>', mystr)
        for block in ssubcata:
            sssubcata = re.findall(r'blank">(.*?)</a>', block)
            # Guard: the original indexed [0] unconditionally and would
            # raise IndexError on a block with no match.
            if sssubcata:
                print("三级:", sssubcata[0])
            urladdress = re.findall(r'href="http://(.*?)" class="kaj">(.*?)</a>', block)
            for url, title in urladdress:
                print(url, title)
def subcatagory(mystr, j):
    """Print the second-level categories under the "生活服务" (life
    services) section of the sitemap and recurse into each of them.

    mystr: decoded HTML of the whole sitemap page.
    j: section ordinal; only section 1 ("生活服务") is handled.
    """
    if j != 1:
        return
    subcata = re.findall(r'生活服务</b></h3><ul class="list clearfix">(.*?)</ul></div>', mystr)
    print("一级:生活服务--")
    # BUG FIX: the original searched str(subcata) -- the *repr* of the
    # result list -- which adds brackets/quotes and escapes backslashes and
    # can corrupt matches; join the captured fragments instead.
    # Example target: href="http://gouwu.hao123.com/" class="link"><b>购物</b>
    ssubcata = re.findall(r'href="(.*?)" class="link"><b>(.*?)</b>', ''.join(subcata))
    for k, (link, title) in enumerate(ssubcata, start=1):
        print("二级:", link, title)
        # gouwu, caipiao and so on
        ssubcatagory(link, k)
def mainfun():
    """Download the hao123 sitemap, save a decoded copy to results.txt,
    and walk every <div class="section"> category found on the page."""
    url = 'http://www.hao123.com/sitemap'
    with urllib.request.urlopen(url) as fp:
        content = fp.read()
    mystr = content.decode('GBK')
    # BUG FIX: the original wrote `file.close` (missing parentheses), so the
    # handle was never explicitly closed; a context manager closes it
    # deterministically.  An explicit encoding avoids locale-dependent
    # failures when writing the Chinese text.
    with open('results.txt', 'w', encoding='utf-8') as out:
        out.write(mystr)
    # Example target: <div class="section" id="生活服务">
    catapattern = re.findall(r'<div class="section" id="(.*?)">', mystr)
    for j, _section in enumerate(catapattern, start=1):
        subcatagory(mystr, j)
# Run the scrape only when executed as a script, not on import.
if __name__ == "__main__":
    mainfun()
相关文章推荐
- python-获取提取网页url爬虫学习(1)
- 【Python编程】网页中文提取正则
- python使用正则表达式提取网页URL的方法
- Python 网络爬虫 009 (编程) 通过正则表达式来获取一个网页中的所有的URL链接,并下载这些URL链接的源代码
- 用python正则表达式提取网页的url
- Python 网络爬虫 009 (编程) 通过正则表达式来获取一个网页中的所有的URL链接,并下载这些URL链接的源代码
- python3爬取百度搜索结果url,获得真实url,提取网页正文并分词,多进程的使用
- python系列之数据处理编程实例
- 【Python编程】读取网页内容并存储过滤
- 通过实例浅析Python对比C语言的编程思想差异
- python 编程实例 2
- Python实现抓取网页并且解析的实例
- python提取页面内url列表的方法
- Python编程求质数实例代码
- asp.net正则表达式提取网页网址、标题、图片实例以及过滤所有HTML标签实例
- 黄聪:使用Python中的HTMLParser、cookielib抓取和解析网页、从HTML文档中提取链接、图像、文本、Cookies(二)
- Python -- 网络编程 -- 抓取网页图片 -- 图虫网
- Python设计模式编程中Adapter适配器模式的使用实例
- Python socket编程实例详解