Python爬取扇贝“【无老师】7天搞定TOEFL单词”
2018-01-18 15:33
288 查看
#!/usr/bin/env python3
"""Scrape a Shanbay wordbook and save its word/definition pairs to a CSV file.

The script fetches the wordbook index page, collects the URL of every word
list (plus its paginated follow-up pages), extracts the table rows from each
page and writes the English word and its definition to ``shanbay.csv``.
"""
import codecs
import csv

import bs4
import requests
from bs4 import BeautifulSoup


def check_link(url, timeout=10):
    """Fetch *url* and return the decoded page text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page body as text, or ``None`` when the request fails (a message
        is printed in that case).
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        print('无法链接服务器!!!')
        return None
    # Let requests guess the encoding from the body instead of the headers.
    r.encoding = r.apparent_encoding
    return r.text


def is_alphabet(uchar):
    """Return True if the single character *uchar* is an ASCII letter.

    The check is a plain range comparison, so it is only meaningful for
    one-character strings; longer strings are compared lexicographically.
    """
    return ('\u0041' <= uchar <= '\u005a') or ('\u0061' <= uchar <= '\u007a')


def get_contents(urlist):
    """Collect the table rows of every page in *urlist*.

    Args:
        urlist: A list containing all the useful urls.

    Returns:
        A list with one entry per ``<tr>`` element found; each entry is the
        list of ``.string`` values of that row's direct children.
    """
    result = []
    for url in urlist:
        content = check_link(url)
        if content is None:
            # The download failed (already reported); skip this page.
            continue
        soup = BeautifulSoup(content, 'lxml')
        for tr in soup.find_all('tr'):
            result.append([child.string for child in tr])
    return result


def get_urls(url_content, root_url="https://www.shanbay.com",
             list_prefix='【无老师7天TOEFL】List', max_page=10):
    """Extract the word-list URLs from the wordbook index page.

    Args:
        url_content: HTML text of the index page (``None`` yields no URLs).
        root_url: Prefix joined in front of every link's ``href``.
        list_prefix: Link text prefix that identifies a word-list link.
        max_page: Highest ``?page=`` number generated for each list.

    Returns:
        A list with, for every matching link, its URL followed by the URLs
        of pages 2..max_page of the same list.
    """
    ulist = []
    if url_content is None:
        return ulist
    soup = BeautifulSoup(url_content, 'lxml')
    for link in soup.find_all('a'):
        text = link.string
        href = link.get('href')
        # Links without a single text node or without an href cannot match.
        if text is None or href is None:
            continue
        if not text.startswith(list_prefix):
            continue
        base = root_url + href
        ulist.append(base)
        for page in range(2, max_page + 1):
            ulist.append(base + '?page=' + str(page))
    return ulist


def save_contents(result):
    """Write the word/definition pairs in *result* to ``shanbay.csv``.

    Args:
        result: All the useful rows scraped from the urls; index 1 of a row
            is used as the word and index 3 as its definition.

    Rows whose word does not start with an ASCII letter are skipped; rows
    that are too short or hold no word are reported and skipped.
    """
    with codecs.open('shanbay.csv', 'w', 'utf_8_sig') as f:
        writer = csv.writer(f)
        for i, row in enumerate(result):
            try:
                word = row[1]
                # Test only the first character: comparing the whole word
                # against 'z'/'Z' would reject words such as "zeal".
                if is_alphabet(word[:1]):
                    writer.writerow([word, row[3]])
                    print("write in line:", i)
            except (IndexError, TypeError):
                print("error in line:{}, contents is:{}".format(i, row))


def main():
    """Run the whole scrape: index page -> list URLs -> rows -> CSV."""
    src_url = "https://www.shanbay.com/wordbook/5440/"
    # Get the contents of the source page.
    src_content = check_link(src_url)
    if src_content is None:
        # Nothing to scrape when the index page could not be fetched.
        return
    # Get all the useful urls in the source page.
    urls = get_urls(src_content)
    # Scrape all the useful contents from all the urls.
    result = get_contents(urls)
    # Save all the useful contents into csv.
    save_contents(result)


if __name__ == "__main__":
    main()
效果图:
相关文章推荐
- [python爬虫]模拟登陆扇贝单词
- 张红岩的《TOEFL词汇精选》,我制订了《十天搞定TOEFL单词》
- 《十天搞定TOEFL单词》(王玉梅TOEFL词汇)
- “34天搞定4、6、TOEFL单词”时间表
- 【扇贝批量添加单词到词库】利用python调用扇贝API (oauth2)
- 【扇贝批量添加单词到词库】利用python调用扇贝API (oauth2)
- 12步轻松搞定python装饰器
- Redis的Python实践,以及四种常用应用场景详解——学习董伟明老师的《Python Web开发实践》
- python MapReduce单词统计
- 公开课发布:《1小时用Python搞定发布自动化》by张老师
- Python官方文档陌生英文单词记录本
- python读取文件里的单词,统计词频,输出到文件
- 2018最常见的Python面试题,一文看完带你搞定考官(上)
- [python]一行搞定字符串排序
- 12步轻松搞定python装饰器
- [python]使用Counter统计文章中出现频率最高的单词
- python 统计文本单词数-字典排序
- python字典用法-统计统计一句单词
- 读书札记:7天搞定C语言(一)
- hadoop, 用java 和 python 实现 wordcount 简单单词提取累加小程序