您的位置：首页 > 其它

Python 抓取网页内容：GET 与 POST 方法

2010-08-21 16:48 417 查看

用我自己的项目做的，就是在百姓网上找的信息，然后通过post传到我自己的网站上

#!/usr/bin/python

#-*-coding:utf-8-*-

import urlparse

import sys, urllib

import re

import urllib2

# Scrape listing pages from an index page and re-post each listing to
# another site as a form POST.

# Captures the value of an href attribute, whether or not it is quoted.
urlre = re.compile(r"href=[\"']?([^ >\"']+)")
# Captures the text of the page's <title> element (case-insensitive).
# Compiled once here rather than on every loop iteration.
titlere = re.compile(r"<title>(.*?)</title>", re.I)
# Captures the listing text that is split into fields further down.
# NOTE(review): as written this matches the entire page. The published
# pattern presumably had HTML tags around the group that did not survive;
# restore them before relying on the field split below.
contentre = re.compile(r"([\s\S]*)", re.I)

# Index page whose links are crawled.
ur = 'http://changsha.baixing.com/diannao/'
# Endpoint that receives each scraped listing.
post_url = 'http://www.jixincs.info/addInfo'

# Collect the absolute URLs of the individual listing pages.
result = []
index_page = urllib.urlopen(ur)
try:
    for eachline in index_page:
        for x in urlre.findall(eachline):
            # Keep only relative links that start with 'a' (for example
            # href="a64095497.html") and resolve them against the index URL.
            if not x.startswith("http:") and x.startswith('a'):
                result.append(urlparse.urljoin(ur, x))
finally:
    index_page.close()

leng = len(result)
print(leng)

for url in result:
    # Download the listing page.
    wp = urllib.urlopen(url)
    try:
        content = wp.read()
    finally:
        wp.close()

    # Decode the UTF-8 page bytes to unicode exactly once, instead of
    # round-tripping through GBK and the Windows-only 'mbcs' codec. Every
    # comparison and replacement below is then unicode against unicode, so
    # it does not depend on the interpreter's default encoding.
    title = titlere.search(content).group(1).decode('utf-8')
    cont = contentre.search(content).group(1).decode('utf-8')
    print(cont)

    # The listing text is a space-separated sequence of fields; check the
    # page source for the exact layout.
    s = cont.split(' ')

    # Field 0: publication time, with its label removed.
    s[0] = s[0].replace(u'发布时间：', u'')
    print(s[0])

    # Field 1: location, with its label removed.
    s[1] = s[1].replace(u'所在地：', u'')
    print(s[1])

    if u'具体' in s[2]:
        # An optional specific-location field follows the location. Fold it
        # into the address; price and description then sit one slot later.
        s[1] = s[1] + s[2].replace(u'具体地点：', u'')
        s[2] = s[3].replace(u'价格：', u'')
        body_start = 4
    else:
        s[2] = s[2].replace(u'价格：', u'')
        body_start = 3

    # Everything between the price and the final field is the description.
    # Using a start index (rather than copying s[4] into s[3]) avoids
    # emitting the first description token twice.
    t = u''.join(s[body_start:-1])

    # Re-brand the final field before appending it to the description.
    if u'百姓网' in s[-1]:
        s[-1] = s[-1].replace(u'百姓网', u'聚信网')
        print(s[-1])
    t += s[-1]
    print(s[0])

    # Form fields for the POST. urllib.urlencode() calls str() on every
    # value, which fails for non-ASCII unicode, so text is encoded here.
    # NOTE(review): UTF-8 is assumed to be what the receiving site expects;
    # confirm. The 'sumbit' key is kept verbatim; verify it against the
    # form's actual field name.
    values = {
        'category': '1',
        'school': '',
        'subcategory': '5001',
        'title': title.encode('utf-8'),
        'sumbit': '',
        'price': s[2].encode('utf-8'),
        'phone': '',
        'content': t.encode('utf-8'),
        'address': s[1].encode('utf-8'),
        'file': '',
    }
    data = urllib.urlencode(values)

    # Supplying a data argument makes urllib2 issue a POST.
    req = urllib2.Request(post_url, data)
    response = urllib2.urlopen(req)
    try:
        the_page = response.read()
    finally:
        response.close()

# The two hard parts of scraping pages like these are converting the
# character encoding of Chinese text and getting the regular expressions
# to match.

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航