
Extracting job-posting content from haitou.cc with bs4 and storing it in MongoDB

2015-09-29 17:11
Example page: http://xyzp.haitou.cc/article/722427.html
First, download every page to local disk. Either os.system("wget " + str(url)) or urllib2.urlopen(url) works; it's simple, so I won't dwell on it.
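For completeness, here is a minimal download sketch using urllib2 (Python 2). The local filename scheme is my own choice for illustration, not part of the original workflow:

# minimal sketch: fetch one posting and save it locally (Python 2 / urllib2)
import urllib2

def download(url, fname):
    html = urllib2.urlopen(url).read()
    with open(fname, "w") as fw:
        fw.write(html)

download("http://xyzp.haitou.cc/article/722427.html", "722427.html")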

Then comes the main part: extracting the information.

#!/usr/bin/env python
# coding=utf-8

from bs4 import BeautifulSoup
from pymongo import MongoClient
import codecs
import os
import re
import sys

reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 hack: default str/unicode conversions to utf-8

def get_jdstr(fname):
    """Parse one saved posting page and return its fields as a dict."""
    retdict = {}
    with open(fname) as fr:
        # some pages contain doubled quotes that trip up the parser
        soup = BeautifulSoup(fr.read().replace('""', '"'), "html.parser")

    jdstr = soup.get_text()

    retdict["inc_name"] = soup.title.string.split()[0]  # company name from <title>
    retdict["page_content"] = soup.find_all("div", "panel-body panel-body-text")[0].get_text()
    retdict["index_url"] = re.search(r"http://xyzp\.haitou\.cc/article/\d+\.html", jdstr).group()
    # the four <p class="text-ellipsis"> blocks hold source, workplace, tags, publish time
    retdict["info_from"] = soup.find_all("p", "text-ellipsis")[0].contents[1].get_text()
    retdict["workplace"] = soup.find_all("p", "text-ellipsis")[1].contents[1].get_text()
    retdict["info_tag"] = soup.find_all("p", "text-ellipsis")[2].contents[1].get_text()
    retdict["pub_time"] = soup.find_all("p", "text-ellipsis")[3].contents[1].get_text()

    return retdict

def JD_extr():
    """Extract every downloaded page and dump the fields to a CSV-like file."""
    fnames = [fname for fname in os.listdir("./") if fname.endswith(".html")]
    fw = codecs.open("tmp_jd_haitou_clean.csv", "w", "utf-8")
    res = []
    for fname in fnames[1:500]:  # process a batch of up to 499 pages
        tmp = []
        retdict = get_jdstr(fname)
        res.append(retdict)
        for k, v in retdict.iteritems():
            tmp.append(v)
        fw.write(" , ".join(tmp) + "\n")
        fw.write("===" * 20 + "\n")  # separator between records
        print fname, "done!"
    fw.close()
    return res

def change2html():
    """Rename downloaded .txt pages to .html so JD_extr() picks them up."""
    fnames = [fname for fname in os.listdir("./") if fname.endswith(".txt")]
    for fname in fnames:
        cmd = "mv " + str(fname) + " " + fname[:-3] + "html"
        print cmd
        os.system(cmd)

def store2mongodb():
    """Insert every extracted record into the JD_Haitou.haitouJD collection."""
    client = MongoClient("localhost", 27017)
    db = client.JD_Haitou

    documents = JD_extr()
    for d in documents:
        db.haitouJD.insert(d)

    mycol = db["haitouJD"]
    print mycol.count()  # sanity check: number of stored documents

def split_jd_test_data(fname='./tmp_jd_haitou_clean.csv'):
    """Pull the article URL out of each CSV record and write 'url<TAB>count' lines."""
    fw = codecs.open('./split_jd_res.csv', 'w', 'utf-8')
    fr = codecs.open(fname, 'r', 'utf-8')
    indexurl = re.compile(r"http://xyzp\.haitou\.cc/article/\d+\.html")
    for line in fr:
        if indexurl.search(line):
            url = indexurl.search(line).group()
            cnt = '1'  # defaults to 1
            fw.write(url + "\t" + cnt + "\n")
    fr.close()
    fw.close()

if __name__ == "__main__":
    # store2mongodb() runs JD_extr() itself, which also writes the CSV,
    # so a separate JD_extr() call here would just repeat the extraction
    store2mongodb()
    split_jd_test_data()
    print "done"