python抓取网页例子
2015-09-03 21:52
543 查看
python抓取网页例子
最近在学习python,刚刚完成了一个网页抓取的例子,通过python抓取全世界所有的学校以及学院的数据,并存为xml文件。数据源是人人网。因为刚学习python,写的代码还不够Pythonic。
核心代码如下:
#!/usr/bin/python import urllib.request from html.parser import HTMLParser import json import time import xml.dom.minidom import os class Dept(): id = 0 name = '' class University(Dept): depts = [] class City(Dept): universities = [] class Country(Dept): cities = [] class MyHtmlParser(HTMLParser): def __init__(self): HTMLParser.__init__(self) self.links = [] self.depts = [] def handle_starttag(self, tag, attrs): if tag == 'option': for att in attrs: for a in att: if a != 'value' and a != '': self.depts.append(a) def readDept(code): depts = [] html = '' for word in urllib.request.urlopen('http://www.renren.com/GetDep.do?id=' + str(code)).readlines(): real = word.strip().decode('gbk') html = html + real hp = MyHtmlParser() hp.feed(html) for inst in hp.depts: dept = Dept() dept.name = inst depts.append(dept) return depts def writeXml(city): impl = xml.dom.minidom.getDOMImplementation() dom = impl.createDocument(None, 'city', None) root = dom.documentElement filename = city.name + '.xml' if os.path.isfile(filename): os.remove(filename) nameE = dom.createElement('name') nameT = dom.createTextNode(city.name) idE = dom.createElement('id') idT = dom.createTextNode(str(city.id)) nameE.appendChild(nameT) idE.appendChild(idT) root.appendChild(nameE) root.appendChild(idE) univs = dom.createElement('universities') root.appendChild(univs) for uni in city.universities: # print('write xml' + city.name + '\t' + uni.name) universityE = dom.createElement('university') univs.appendChild(universityE) uniE = dom.createElement('name') uniT = dom.createTextNode(uni.name) uidE = dom.createElement('id') uidT = dom.createTextNode(str(uni.id)) uniE.appendChild(uniT) uidE.appendChild(uidT) universityE.appendChild(uniE) universityE.appendChild(uidE) deptsE = dom.createElement('departments') universityE.appendChild(deptsE) for dep in uni.depts: deptE = dom.createElement('department') deptsE.appendChild(deptE) deptNameE = dom.createElement('name') deptIdE = dom.createElement('id') deptT = dom.createTextNode(dep.name) deptIdT = dom.createTextNode(str(dep.id)) deptNameE.appendChild(deptT) deptIdE.appendChild(deptIdT) deptE.appendChild(deptNameE) f= open(filename, 'w', encoding='utf-8') dom.writexml(f, addindent=' ', newl='\n',encoding='utf-8') print('write xml :' + city.name + '.xml') f.close() def mkdir(path): path=path.strip() path=path.rstrip("/") isExists=os.path.exists(path) if not isExists: os.makedirs(path) def readData(content): counties = [] jdata = json.loads(content) for i in range(0,100): try: country = Country() country.name = jdata[i]['name'] country.id = jdata[i]['id'] provs = jdata[i]['provs'] for prov in provs: city = City() city.name = prov['name'] city.id = prov['id'] country.cities.append(city) city.universities = [] for dic in prov['univs']: university = University() university.id = dic['id'] university.name = dic['name'] # print('get data: \t' + university.name) university.depts = readDept(university.id) city.universities.append(university) print('city = ' + city.name + '\tuniversity = ' + university.name) writeXml(city) counties.append(country) except IndexError: break; return counties print('开始时间:' + time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))) f = open('data','r' ) content = f.read() f.close() counties = readData(content) print('结束时间:' + time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())))
其中data是从如下网站拿到的
http://s.xnimg.cn/allunivlist.js
相关文章推荐
- 【转】Python3.x移除了callable内建函数
- "Python"学习笔记(二)
- Python os-操作系统接口
- python list元素为dict时的排序
- Python
- [转]在Ubuntu中安装Python科学计算环境
- 利用Python写的展示汉诺塔(hanoi)解法的小程序_Prj003
- python重载运算符
- python若干小函数的使用
- python 爬虫获取网站信息(一)
- 欢迎使用CSDN-markdown编辑器
- python内置函数zip
- Python Serial 与STM32J进行串口通讯
- webdriver自动化测试_键盘事件 python
- python学习笔记(-)
- (四)处理列表数据
- 用python写一个温度转换程序
- 系统性能信息模块篇psutil之系统进程管理方法
- 【Python学习日记】 第三天
- python、pip安装