您的位置:首页 > 编程语言 > Python开发

【Python编程】网页URL提取实例

2014-02-25 18:29 633 查看
import urllib.request

import re

def ssubcatagory(urllink, j):
    """Print the third-level groups and site links found on a category page.

    Downloads ``urllink``, decodes it, and for every
    ``<div class="kag sclearfix">`` section prints the section title (the
    text between the first ``blank">`` and the following ``</a>``) and then
    one line per ``class="kaj"`` link in the form
    ``<address without the http:// prefix> <link text>``.

    Args:
        urllink: URL of the second-level category page to fetch.
        j: 1-based position of this category among its siblings. Only
            positions 1..10 are processed; anything above is skipped
            without fetching the page.

    Returns:
        None. All results are written to standard output.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` when the page
            cannot be fetched.
    """
    # Nothing is printed for positions above 10, so skip the download too.
    if j > 10:
        return

    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(urllink) as response:
        raw = response.read()

    # Sniff the charset declaration in the first 1000 raw bytes,
    # case-insensitively; fall back to GB2312 when none is found.
    head = raw[:1000].lower()
    if b"utf-8" in head:
        encoding = 'UTF-8'
    elif b"gbk" in head:
        encoding = 'GBK'
    else:
        encoding = 'GB2312'
    # A stray undecodable byte should not abort the whole listing.
    page = raw.decode(encoding, errors='replace')

    # NOTE: the class attribute must be spelled without a backslash before
    # the "k"; "\k" is rejected by the re module as a bad escape.
    sections = re.findall(
        r'<div class=\"kag sclearfix\">(.*?)</div></div></div>', page)

    for section in sections:
        titles = re.findall(r'blank\">(.*?)</a>', section)
        if titles:
            print("三级:", titles[0])

        # [^"]* keeps the address inside one href attribute, so an earlier
        # anchor without class="kaj" cannot be swallowed into the match.
        links = re.findall(
            r'href=\"http\://([^\"]*)\" class=\"kaj\">(.*?)</a>', section)
        for address, label in links:
            print(address, label)

 

def subcatagory(mystr, j):
    """Print the second-level links of the first sitemap category and expand each.

    Only the first top-level category (``j == 1``) is handled; for it the
    heading line is printed, then every ``class="link"`` anchor inside the
    matched list sections is printed as ``二级: <url> <title>`` and passed
    to ``ssubcatagory`` together with its 1-based position.

    Args:
        mystr: Decoded HTML text of the sitemap page.
        j: 1-based index of the top-level category being processed.

    Returns:
        None. Results are written to standard output.
    """
    if j != 1:
        return

    sections = re.findall(
        r'生活服务</b></h3><ul class=\"list clearfix\">(.*?)</ul></div>', mystr)
    # Example of an anchor this is looking for:
    #   href="http://gouwu.hao123.com/" class="link"><b>购物</b>
    print("一级:生活服务--")

    # Search each captured section directly rather than the repr of the
    # whole list: repr escaping would alter backslashes/quotes in the URLs
    # and would let one match run across two sections. [^"]* keeps the URL
    # inside a single href attribute.
    link_pattern = re.compile(
        r'href=\"([^\"]*)\" class=\"link\"><b>(.*?)</b>')
    position = 0
    for section in sections:
        for url, title in link_pattern.findall(section):
            position += 1
            print("二级:", url, title)
            # Each link is a second-level category page (shopping,
            # lottery, ...); expand it into its third-level entries.
            ssubcatagory(url, position)

def mainfun():
    """Download the hao123 sitemap, dump it to ``results.txt`` and walk it.

    The page is fetched from ``http://www.hao123.com/sitemap`` and decoded
    as GBK. The decoded text is written to ``results.txt`` in the current
    working directory, and ``subcatagory`` is then called once per
    ``<div class="section" id="...">`` element found, with a 1-based index.

    Returns:
        None.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` when the sitemap
            cannot be fetched.
        UnicodeDecodeError: if the response is not valid GBK.
    """
    url = 'http://www.hao123.com/sitemap'

    # Close the response as soon as the body has been read, even on error.
    with urllib.request.urlopen(url) as response:
        mystr = response.read().decode('GBK')

    # Keep a copy of the page for inspection. The with-block guarantees the
    # file is flushed and closed; an explicit encoding keeps the write from
    # failing on locales whose default codec cannot represent Chinese text.
    with open('results.txt', 'w', encoding='utf-8') as dump:
        dump.write(mystr)

    # Sections look like: <div class="section" id="生活服务">
    section_ids = re.findall(r'<div class=\"section\" id=\"(.*?)\">', mystr)

    # Only the position of each section is passed on; subcatagory re-scans
    # the full page text itself.
    for j, _section_id in enumerate(section_ids, start=1):
        subcatagory(mystr, j)

# Start the crawl. There is no ``__main__`` guard, so this call also runs
# whenever the module is imported.
mainfun()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: