
Python Data Visualization Programming (1)

2018-03-28 19:18

Reading CSV Files

# method one
import csv
import sys

filename = 'ch02-data.csv'
data = []

try:
    with open(filename) as f:
        reader = csv.reader(f)
        header = reader.next()            # first row is the header
        data = [row for row in reader]    # remaining rows are the data
except csv.Error as e:
    print "error reading csv file at line %s: %s" % (reader.line_num, e)
    sys.exit(-1)

if header:
    print header
    print "========================="

for datarow in data:
    print datarow

# method two
import numpy
# note: loadtxt does not treat the first line specially, so the header is read as data
data = numpy.loadtxt('ch02-data.csv', dtype='string', delimiter=',')
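If the header row should not end up in the array, numpy.loadtxt can skip it. A minimal sketch, assuming ch02-data.csv has exactly one header line:

import numpy

# skiprows=1 drops the single header line; the remaining rows are still read as strings
data = numpy.loadtxt('ch02-data.csv', dtype='string', delimiter=',', skiprows=1)
print data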


Reading Excel Files

import xlrd

file = 'data'
wb = xlrd.open_workbook(filename=file)
ws = wb.sheet_by_name('sheet1')
dataset = []

# walk every cell, row by row, and collect the values
for r in xrange(ws.nrows):
    col = []
    for c in range(ws.ncols):
        col.append(ws.cell(r, c).value)
    dataset.append(col)

from pprint import pprint
pprint(dataset)
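If the sheet name is not known in advance, xlrd can also address sheets by position. A small sketch, under the same assumption about the workbook path used above:

import xlrd

wb = xlrd.open_workbook(filename='data')
ws = wb.sheet_by_index(0)            # first sheet, whatever its name
print ws.name, ws.nrows, ws.ncols    # quick look before reading the cells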


Importing Data from Fixed-Width Data Files

import struct

datafile = 'ch02-data-width-1M.data'
mask = '9s14s5s'   # three fields: 9, 14, and 5 bytes wide

with open(datafile, 'r') as f:
    for line in f:
        # unpack_from() parses each line according to the format mask
        fields = struct.Struct(mask).unpack_from(line)
        # strip() removes leading and trailing whitespace from each field
        print 'fields:', [field.strip() for field in fields]
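The recipe assumes ch02-data-width-1M.data already exists. A minimal sketch that writes a few sample lines matching the '9s14s5s' mask, with made-up values, so the reader above has something to parse:

# each line is exactly 9 + 14 + 5 characters wide, plus a newline
with open('ch02-data-width-1M.data', 'w') as f:
    for i in range(5):
        f.write('%9d%14d%5d\n' % (i, i * 100, i % 3))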


Reading Data from Tab-Delimited Files

import csv
import sys

filename = 'ch02-data.tab'
data = []

try:
    with open(filename) as f:
        reader = csv.reader(f, dialect=csv.excel_tab)
        header = reader.next()
        data = [row for row in reader]
except csv.Error as e:
    print "error reading csv file at line %s: %s" % (reader.line_num, e)
    sys.exit(-1)

if header:
    print header
    print "========================="

for datarow in data:
    print datarow

# clean up dirty data: strip stray whitespace, then split on tabs manually
datafile = 'data-1.tab'
with open(datafile, 'r') as f:
    for line in f:
        line = line.strip()
        print line.split('\t')
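To see why the manual strip-and-split pass helps, here is a small sketch that writes a "dirty" tab file with stray trailing whitespace (the file name and values are made up) and runs the same loop over it:

# a tab-separated file with trailing spaces at the end of some lines
with open('data-1.tab', 'w') as f:
    f.write('day\tamount\n')
    f.write('1\t3000  \n')
    f.write('2\t3500 \n')

with open('data-1.tab', 'r') as f:
    for line in f:
        print line.strip().split('\t')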


Importing Data from a JSON Data Source

"""
从JSON数据源导入数据
pip install requests
"""
import requests
url = 'https://github.com/timeline.json'
r = requests.get(url)#获取远程资源
json_obj = r.json()
repos = set()

for entry in json_obj:
try:
repos.add(entry['repository']['url'])
except KeyError as e:
print "no key %s.skipping......"%(e)

from pprint import pprint
pprint(repos)
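If the JSON lives in a local file instead of behind an HTTP endpoint, the standard json module handles the same parsing step. A minimal sketch, assuming a hypothetical timeline.json saved on disk:

import json

with open('timeline.json') as f:
    json_obj = json.load(f)        # same kind of structure as r.json() above
print type(json_obj), len(json_obj)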


Exporting Data to JSON, CSV, and Excel

"""
导出数据到JSON、CSV、Excel
"""
#first,import the moudle we need
import os
import sys
import argparse
try:
import cStringIO as StringIO
except:
import StringIO
import struct
import json
import csv

# second, define the methods that read the input and dispatch to a writer
def import_data(import_file):
    mask = '9s14s5s'
    data = []
    with open(import_file, 'r') as f:
        for line in f:
            fields = struct.Struct(mask).unpack_from(line)
            data.append(list([f.strip() for f in fields]))
    return data

def write_data(data, export_format):
    if export_format == 'csv':
        return write_csv(data)
    elif export_format == 'json':
        return write_json(data)
    elif export_format == 'xlsx':
        return write_xlsx(data)
    else:
        raise Exception('Illegal format defined')

# third, implement a writer for each output format
def write_csv(data):
    f = StringIO.StringIO()
    writer = csv.writer(f)
    for row in data:
        writer.writerow(row)
    return f.getvalue()

def write_json(data):
    j = json.dumps(data)
    return j

def write_xlsx(data):
    from xlwt import Workbook
    book = Workbook()
    sheet1 = book.add_sheet('Sheet 1')
    row = 0
    for line in data:
        col = 0
        for datum in line:
            print datum
            sheet1.write(row, col, datum)
            col += 1
        row += 1
        if row > 65535:
            print >> sys.stderr, "Hit the limit of 65535 rows in a single sheet."
            break
    f = StringIO.StringIO()
    book.save(f)
    return f.getvalue()

# fourth, the main entry point
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('import_file', help='Path to a fixed-width data file.')
    parser.add_argument('export_format', help='Export format: json, csv, xlsx')
    args = parser.parse_args()

    if args.import_file is None:
        print >> sys.stderr, "You must specify a path to the file to import from."
        sys.exit(1)
    if args.export_format not in ('csv', 'xlsx', 'json'):
        print >> sys.stderr, "You must specify a valid export format: csv, json or xlsx."
        sys.exit(1)
    if not os.path.isfile(args.import_file):
        print >> sys.stderr, "Given path is not a file: %s" % args.import_file
        sys.exit(1)

    data = import_data(args.import_file)
    print write_data(data, args.export_format)
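A short usage note, assuming the script is saved as ch02-export.py (an illustrative name) and the fixed-width sample file from the earlier recipe exists:

python ch02-export.py ch02-data-width-1M.data csv
python ch02-export.py ch02-data-width-1M.data json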


Importing Data from HTML

# Importing data from HTML
from lxml.html import parse
from urllib2 import urlopen

parsed = parse(urlopen("https://finance.yahoo.com/q/op?s=AAPL+Options"))
# find the tables in the document
doc = parsed.getroot()
tables = doc.findall(".//table")
# pick one of the tables to experiment with
put = tables[1]

# a table has a header and data rows; in HTML, th cells mark the header row and td cells hold the data
def _unpack(row, kind="td"):
    elts = row.findall(".//%s" % kind)
    return [val.text_content() for val in elts]

# when importing the table we also have to deal with value types;
# pandas' TextParser class handles the type conversion automatically
from pandas.io.parsers import TextParser

def parse_options_data(table):
    rows = table.findall(".//tr")
    header = _unpack(rows[0], kind="th")
    data = [_unpack(r) for r in rows[1:]]
    return TextParser(data, names=header).get_chunk()

# finally, run the parser on the chosen table
put_data = parse_options_data(put)
print put_data[:10]

# we can also collect every URL in the document; links use the a tag
links = doc.findall(".//a")
# print links[15:20]
# a link's URL comes from get("href") and its text from text_content()
urls = [lnk.get("href") for lnk in links]
text = [lnk.text_content() for lnk in links]

from pprint import pprint
pprint(urls[:10])
print "============"
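Because the Yahoo page layout can change and the call above needs network access, it can help to try the _unpack helper defined above on a tiny inline table first. A sketch using a made-up two-row table:

from lxml.html import fromstring

snippet = fromstring(
    "<table>"
    "<tr><th>Strike</th><th>Price</th></tr>"
    "<tr><td>100</td><td>1.25</td></tr>"
    "</table>")
rows = snippet.findall(".//tr")
print _unpack(rows[0], kind="th")   # header cells: ['Strike', 'Price']
print _unpack(rows[1], kind="td")   # data cells:   ['100', '1.25']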


Importing Data from a Database

"""
从数据库导入数据
pip install sqlite3
"""
#first,吧SQL文件导入到SQLite数据库中
import sqlite3
import sys
if len(sys.argv) < 2:
print'error: you must supply at least SQL script'
print'usage: %s tabel.db ./sql-dump.sql'%(sys.argv[0])
sys.exit(1)
script_path = sys.argv[1]
if len(sys.argv) == 3:
db = sys.argv[2]
else:
db = ':memory:'
try:
con = sqlite3.connect(db)
with con:
cur = con.cursor()
with open(script_path,'rb') as f:
cur.executescript(f.read())
except sqlite3.Error as err:
print'Error occured: %s'%err

# code that reads the data back from the database file
import sqlite3
import sys

if len(sys.argv) != 2:
    print "please specify a database file."
    sys.exit(1)

db = sys.argv[1]
try:
    con = sqlite3.connect(db)
    with con:
        cur = con.cursor()
        query = 'SELECT ID, Name, Population FROM City ORDER BY Population DESC LIMIT 1000'
        con.text_factory = str
        cur.execute(query)
        resultset = cur.fetchall()
        col_names = [cn[0] for cn in cur.description]
        print '%10s %30s %10s' % tuple(col_names)
        print '=' * (10 + 1 + 30 + 1 + 10)
        for row in resultset:
            print '%10s %30s %10s' % row
except sqlite3.Error as err:
    print '[ERROR]:', err
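A short usage note, assuming the two snippets are saved as ch02-import.py and ch02-read.py (illustrative names) and that a world.sql dump defining the City table is available:

python ch02-import.py world.sql world.db
python ch02-read.py world.db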