您的位置：首页 > 编程语言 > Python开发
使用python爬取猫眼电影、房王、股吧论坛、百度翻译、有道翻译、高德天气、华夏基金、扇贝单词、糗事百科（华夏基金）

2019-05-26 22:20 459 查看
import requests,re

#获取整个网页
# with open('华夏基金.html','w',encoding='utf-8') as f:
#     f.write(html)

class Huaxia():
    """Scrape the ChinaAMC (华夏基金) "all funds" page and report every fund.

    The page is fetched once, cut into its fund tables with regular
    expressions, and each table is turned into one dictionary per fund whose
    keys are the column titles scraped from that table's header row.

    According to the original author's notes the page holds four tables:

    1. equity / index / hybrid / bond / ETF funds  -> ``type_1``
    2. money-market funds                          -> ``typer_2``
    3. wealth-management funds                     -> ``typer_3``
    4. closed-end funds                            -> ``typer_4``

    Calling an instance runs the whole pipeline.  Only the money-market
    table is dispatched by ``all_table_data``; the other parsers can be
    called directly with the matching table HTML.

    All parsing is positional (title N is paired with cell M), so an
    ``IndexError`` from any parser means the page layout no longer matches
    what this class expects.
    """

    # Seconds to wait for the fund page before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    # Number of fund tables the page is expected to contain.
    _EXPECTED_TABLES = 4

    # One fund table; non-greedy so every table is captured separately.
    _TABLE_RE = re.compile(
        r'<table width="100%" border="0" cellspacing="0" cellpadding="0" style=(.*?)</table>',
        re.S)
    # Header titles rendered with the "p16_libe" class.
    _TITLE_LEFT_RE = re.compile(r'class="p16_libe">(.*?)</span>')
    # Header titles rendered with an inline line-height style (yield columns).
    _TITLE_RIGHT_RE = re.compile(r'style=" line-height:20px;">(.*?)</span></td>')
    # Per-fund HTML chunk inside each kind of table.
    _TYPE1_FUND_RE = re.compile(r'position: relative(.*?)style="color:red', re.S)
    _YIELD_FUND_RE = re.compile(r'onclick="moveToComPare(.*?)添加</a></td>', re.S)
    _TYPE4_FUND_RE = re.compile(r'class="fundId"(.*?)style="color:red', re.S)
    # Fund short name (taken from the link's title attribute).
    _NAME_RE = re.compile(r' title="(.*?)" target')
    # Data cells of one fund; the multi-line variant is used by typer_3 only.
    _CELL_RE = re.compile(r'<td height="30">(.*?)</td>')
    _CELL_MULTILINE_RE = re.compile(r'<td height="30">(.*?)</td>', re.S)

    # Column layouts as (title index, cell index) pairs.  Title 0 is always
    # paired with the fund name and is therefore not listed here.
    _TYPE1_LAYOUT = tuple((i + 1, i) for i in range(9))
    _TYPE2_LAYOUT = (
        (1, 1),    # fund code
        (2, 2),    # NAV date
        (3, 7),    # inception date
        (4, 8),    # subscription status
        (5, 9),    # redemption status
        (6, 10),   # regular-investment status
        (9, 3),    # income per 10k (or per million) units
        (10, 4),   # 7-day annualised yield (%)
        (11, 5),   # annualised yield over the last 30 days (%)
        (12, 6),   # annualised yield year-to-date (%)
    )
    _TYPE3_LAYOUT = (
        (1, 1),    # fund code
        (2, 2),    # NAV date
        (3, 6),    # inception date
        (4, 7),    # subscription status
        (5, 8),    # redemption status
        (6, 9),    # regular-investment status
        (9, 3),    # income per 10k units (yuan)
        (10, 4),   # 7-day annualised yield (%)
        (11, 5),   # operating-period annualised yield (%)
    )
    # NOTE(review): the original code keeps only the first two characters of
    # the operating-period yield cell; the reason is not visible in this file
    # (possibly stripping trailing markup) -- confirm against the live page.
    _TYPE3_CLIP = {11: 2}
    _TYPE4_LAYOUT = tuple((i + 1, i) for i in range(8))

    def __call__(self, *args, **kwargs):
        """Run the full scrape; extra arguments are accepted and ignored.

        :return: whatever ``base_html`` returns.
        """
        return self.base_html()

    def base_html(self):
        """Download the "all funds" page and hand its HTML to the parser.

        Page: http://fund.chinaamc.com/portal/cn/include/newproducthome.jsp

        :return: the records produced by ``all_table_data``.
        """
        base_url = 'http://fund.chinaamc.com/portal/cn/include/newproducthome.jsp'
        # A timeout keeps an unresponsive server from blocking forever.
        html = requests.get(base_url, timeout=self.REQUEST_TIMEOUT).text
        return self.all_table_data(html)

    def all_table_data(self, html):
        """Split the page into its fund tables and parse the money-market one.

        A table starts with
        ``<table width="100%" border="0" cellspacing="0" cellpadding="0" style=``
        and ends at the first following ``</table>`` (non-greedy match).

        :param html: full page source as text.
        :return: list of per-fund dictionaries from ``typer_2``.
        :raises IndexError: if fewer than four fund tables are found.
        """
        big_tables = self._TABLE_RE.findall(html)
        if len(big_tables) < self._EXPECTED_TABLES:
            # Same exception type as blind indexing, with a useful message.
            raise IndexError(
                'expected at least %d fund tables, found %d'
                % (self._EXPECTED_TABLES, len(big_tables)))

        # Table order: [0] type_1, [1] typer_2, [2] typer_3, [3] typer_4.
        # Only the money-market table is processed here; call the other
        # parsers with big_tables[0], [2] or [3] to report those funds.
        return self.typer_2(big_tables[1])

    def _build_records(self, chunks, titles, cell_rule, layout, clip=None):
        """Turn per-fund HTML chunks into dictionaries, printing each one.

        :param chunks: HTML fragments, one per fund.
        :param titles: column titles used as dictionary keys.
        :param cell_rule: compiled pattern extracting a fund's data cells.
        :param layout: (title index, cell index) pairs to copy.
        :param clip: optional {title index: max length} truncations.
        :return: list of dictionaries, one per fund, in page order.
        :raises IndexError: if a name, title or cell is missing.
        """
        clip = clip or {}
        records = []
        for number, chunk in enumerate(chunks, start=1):
            names = self._NAME_RE.findall(chunk)
            cells = cell_rule.findall(chunk)
            record = {titles[0]: names[0]}
            for title_index, cell_index in layout:
                value = cells[cell_index]
                if title_index in clip:
                    value = value[0:clip[title_index]]
                record[titles[title_index]] = value
            print(number, record)
            records.append(record)
        return records

    # ---- table 1 ---------------------------------------------------------
    def type_1(self, type1):
        """Parse table 1: equity, index, hybrid, bond and ETF funds.

        Titles seen by the original author (2019-05-25): fund name, fund
        code, NAV date, NAV, cumulative NAV, change (%), inception date,
        subscription status, redemption status, regular-investment status,
        online trading, add to watch-list.  The first ten are used; the
        fund name comes from the link title, the rest from the data cells
        in order.

        :param type1: HTML of the first fund table.
        :return: list of dictionaries, one per fund.
        """
        titles = self._TITLE_LEFT_RE.findall(type1)
        chunks = self._TYPE1_FUND_RE.findall(type1)  # 195 funds on 2019-05-25
        records = self._build_records(
            chunks, titles, self._CELL_RE, self._TYPE1_LAYOUT)
        # NOTE(review): indentation was lost in the source; this message is
        # assumed to be printed once, after all funds -- confirm.
        print('年轻人 要有耐心')
        return records

    # ---- table 2 ---------------------------------------------------------
    def typer_2(self, type2):
        """Parse table 2: money-market funds.

        The header uses two markups, so titles are collected with two
        patterns and concatenated.  The resulting order differs from the
        page: the nine regular titles come first, then the four yield
        titles (income per 10k/million units, 7-day annualised yield,
        30-day annualised yield, year-to-date annualised yield).  The first
        data cell of each fund is skipped and the yield cells sit between
        the NAV date and the inception date, hence the explicit layout.

        :param type2: HTML of the second fund table.
        :return: list of dictionaries, one per fund.
        """
        titles_left = self._TITLE_LEFT_RE.findall(type2)
        titles_right = self._TITLE_RIGHT_RE.findall(type2)
        print(titles_left)
        print(titles_right)

        titles = titles_left + titles_right
        print('type2的导航栏为：', titles)

        chunks = self._YIELD_FUND_RE.findall(type2)  # 20 funds on 2019-05-25
        print('小伙子不要慌，再等一下')
        return self._build_records(
            chunks, titles, self._CELL_RE, self._TYPE2_LAYOUT)

    # ---- table 3 ---------------------------------------------------------
    def typer_3(self, type3):
        """Parse table 3: wealth-management funds.

        Same two-pattern header as table 2, but with three yield titles
        (income per 10k units, 7-day annualised yield, operating-period
        annualised yield).  Data cells are matched across line breaks, and
        the operating-period yield is truncated to its first two characters
        exactly as the original code did.

        :param type3: HTML of the third fund table.
        :return: list of dictionaries, one per fund.
        """
        print('不持久啊')
        titles = (self._TITLE_LEFT_RE.findall(type3)
                  + self._TITLE_RIGHT_RE.findall(type3))
        chunks = self._YIELD_FUND_RE.findall(type3)  # 2 funds on 2019-05-25
        return self._build_records(
            chunks, titles, self._CELL_MULTILINE_RE, self._TYPE3_LAYOUT,
            clip=self._TYPE3_CLIP)

    # ---- table 4 ---------------------------------------------------------
    def typer_4(self, type4):
        """Parse table 4: closed-end funds.

        Titles seen by the original author: fund name, fund code, NAV date,
        NAV, cumulative NAV, inception date, maturity date,
        regular-investment status, trading status, online trading, add to
        watch-list.  The first nine are used; the fund name comes from the
        link title, the rest from the data cells in order.

        :param type4: HTML of the fourth fund table.
        :return: list of dictionaries, one per fund.
        """
        titles = self._TITLE_LEFT_RE.findall(type4)
        print(titles)
        chunks = self._TYPE4_FUND_RE.findall(type4)  # 26 funds on 2019-05-25
        records = self._build_records(
            chunks, titles, self._CELL_RE, self._TYPE4_LAYOUT)
        # NOTE(review): indentation was lost in the source; this message is
        # assumed to be printed once, after all funds -- confirm.
        print('小伙子挺快的啊')
        return records

if __name__ == '__main__':
    # Script entry point: calling a Huaxia instance downloads the fund page
    # and prints one dictionary per parsed fund.
    huaxia = Huaxia()
    huaxia()
内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理
标签：
相关文章推荐
新的分享
章节导航