利用python进行数据分析之数据加载、存储、文件格式
2017-10-21 11:06
726 查看
相关数据测试文件,请到此处下载:
http://download.csdn.net/download/u013584315/10014865
导入相关包
In [44]:from __future__ import division from numpy.random import randn import numpy as np import os import sys import matplotlib.pyplot as plt np.random.seed(12345) plt.rc('figure', figsize=(10, 6)) from pandas import Series, DataFrame import pandas as pd np.set_printoptions(precision=4)
读写文本格式的数据
In [4]:df = pd.read_csv('ex1.csv') df
Out[4]:
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
pd.read_table('ex1.csv', sep=',')
Out[5]:
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
pd.read_csv('ex2.csv', header=None)
Out[6]:
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
pd.read_csv('ex2.csv', names=['a', 'b', 'c', 'd', 'message'])
Out[7]:
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
names = ['a', 'b', 'c', 'd', 'message'] pd.read_csv('ex2.csv', names=names, index_col='message') # message 作为DF的索引
Out[8]:
a | b | c | d | |
---|---|---|---|---|
message | ||||
hello | 1 | 2 | 3 | 4 |
world | 5 | 6 | 7 | 8 |
foo | 9 | 10 | 11 | 12 |
parsed = pd.read_csv('csv_mindex.csv', index_col=['key1', 'key2']) parsed
Out[9]:
value1 | value2 | ||
---|---|---|---|
key1 | key2 | ||
one | a | 1 | 2 |
b | 3 | 4 | |
c | 5 | 6 | |
d | 7 | 8 | |
two | a | 9 | 10 |
b | 11 | 12 | |
c | 13 | 14 | |
d | 15 | 16 |
list(open('ex3.txt'))
Out[10]:
[' A B C\n', 'aaa -0.264438 -1.026059 -0.619500\n', 'bbb 0.927272 0.302904 -0.032399\n', 'ccc -0.264273 -0.386314 -0.217601\n', 'ddd -0.871858 -0.348382 1.100491\n']
In [11]:
result = pd.read_table('ex3.txt', sep='\s+') result
Out[11]:
A | B | C | |
---|---|---|---|
aaa | -0.264438 | -1.026059 | -0.619500 |
bbb | 0.927272 | 0.302904 | -0.032399 |
ccc | -0.264273 | -0.386314 | -0.217601 |
ddd | -0.871858 | -0.348382 | 1.100491 |
pd.read_csv('ex4.csv', skiprows=[0, 2, 3]) # skiprows 跳过指定的行(这里跳过第0、2、3行)
Out[12]:
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
result = pd.read_csv('ex5.csv') result
Out[14]:
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3.0 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11.0 | 12 | foo |
pd.isnull(result) #判断是否为空值
Out[15]:
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | False | False | False | False | False | True |
1 | False | False | False | True | False | False |
2 | False | False | False | False | False | False |
result = pd.read_csv('ex5.csv', na_values=['NULL']) # na_values 将指定的值在读取时识别为缺失值(NaN)
Out[33]:
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | NaN | 1 | 2 | 3.0 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11.0 | 12 | foo |
sentinels = {'message': ['foo'], 'something': ['two']} pd.read_csv('ex5.csv', na_values=sentinels) # na_values 可以接受一个字典
Out[34]:
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3.0 | 4 | NaN |
1 | NaN | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11.0 | 12 | NaN |
逐块读取文件
In [35]:result = pd.read_csv('ex6.csv') result
Out[35]:
one | two | three | four | key | |
---|---|---|---|---|---|
0 | 0.467976 | -0.038649 | -0.295344 | -1.824726 | L |
1 | -0.358893 | 1.404453 | 0.704965 | -0.200638 | B |
2 | -0.501840 | 0.659254 | -0.421691 | -0.057688 | G |
3 | 0.204886 | 1.074134 | 1.388361 | -0.982404 | R |
4 | 0.354628 | -0.133116 | 0.283763 | -0.837063 | Q |
5 | 1.817480 | 0.742273 | 0.419395 | -2.251035 | Q |
6 | -0.776764 | 0.935518 | -0.332872 | -1.875641 | U |
7 | -0.913135 | 1.530624 | -0.572657 | 0.477252 | K |
8 | 0.358480 | -0.497572 | -0.367016 | 0.507702 | S |
9 | -1.740877 | -1.160417 | -1.637830 | 2.172201 | G |
10 | 0.240564 | -0.328249 | 1.252155 | 1.072796 | 8 |
11 | 0.764018 | 1.165476 | -0.639544 | 1.495258 | R |
12 | 0.571035 | -0.310537 | 0.582437 | -0.298765 | 1 |
13 | 2.317658 | 0.430710 | -1.334216 | 0.199679 | P |
14 | 1.547771 | -1.119753 | -2.277634 | 0.329586 | J |
15 | -1.310608 | 0.401719 | -1.000987 | 1.156708 | E |
16 | -0.088496 | 0.634712 | 0.153324 | 0.415335 | B |
17 | -0.018663 | -0.247487 | -1.446522 | 0.750938 | A |
18 | -0.070127 | -1.579097 | 0.120892 | 0.671432 | F |
19 | -0.194678 | -0.492039 | 2.359605 | 0.319810 | H |
20 | -0.248618 | 0.868707 | -0.492226 | -0.717959 | W |
21 | -1.091549 | -0.867110 | -0.647760 | -0.832562 | C |
22 | 0.641404 | -0.138822 | -0.621963 | -0.284839 | C |
23 | 1.216408 | 0.992687 | 0.165162 | -0.069619 | V |
24 | -0.564474 | 0.792832 | 0.747053 | 0.571675 | I |
25 | 1.759879 | -0.515666 | -0.230481 | 1.362317 | S |
26 | 0.126266 | 0.309281 | 0.382820 | -0.239199 | L |
27 | 1.334360 | -0.100152 | -0.840731 | -0.643967 | 6 |
28 | -0.737620 | 0.278087 | -0.053235 | -0.950972 | J |
29 | -1.148486 | -0.986292 | -0.144963 | 0.124362 | Y |
... | ... | ... | ... | ... | ... |
9970 | 0.633495 | -0.186524 | 0.927627 | 0.143164 | 4 |
9971 | 0.308636 | -0.112857 | 0.762842 | -1.072977 | 1 |
9972 | -1.627051 | -0.978151 | 0.154745 | -1.229037 | Z |
9973 | 0.314847 | 0.097989 | 0.199608 | 0.955193 | P |
9974 | 1.666907 | 0.992005 | 0.496128 | -0.686391 | S |
9975 | 0.010603 | 0.708540 | -1.258711 | 0.226541 | K |
9976 | 0.118693 | -0.714455 | -0.501342 | -0.254764 | K |
9977 | 0.302616 | -2.011527 | -0.628085 | 0.768827 | H |
9978 | -0.098572 | 1.769086 | -0.215027 | -0.053076 | A |
9979 | -0.019058 | 1.964994 | 0.738538 | -0.883776 | F |
9980 | -0.595349 | 0.001781 | -1.423355 | -1.458477 | M |
9981 | 1.392170 | -1.396560 | -1.425306 | -0.847535 | H |
9982 | -0.896029 | -0.152287 | 1.924483 | 0.365184 | 6 |
9983 | -2.274642 | -0.901874 | 1.500352 | 0.996541 | N |
9984 | -0.301898 | 1.019906 | 1.102160 | 2.624526 | I |
9985 | -2.548389 | -0.585374 | 1.496201 | -0.718815 | D |
9986 | -0.064588 | 0.759292 | -1.568415 | -0.420933 | E |
9987 | -0.143365 | -1.111760 | -1.815581 | 0.435274 | 2 |
9988 | -0.070412 | -1.055921 | 0.338017 | -0.440763 | X |
9989 | 0.649148 | 0.994273 | -1.384227 | 0.485120 | Q |
9990 | -0.370769 | 0.404356 | -1.051628 | -1.050899 | 8 |
9991 | -0.409980 | 0.155627 | -0.818990 | 1.277350 | W |
9992 | 0.301214 | -1.111203 | 0.668258 | 0.671922 | A |
9993 | 1.821117 | 0.416445 | 0.173874 | 0.505118 | X |
9994 | 0.068804 | 1.322759 | 0.802346 | 0.223618 | H |
9995 | 2.311896 | -0.417070 | -1.409599 | -0.515821 | L |
9996 | -0.479893 | -0.650419 | 0.745152 | -0.646038 | E |
9997 | 0.523331 | 0.787112 | 0.486066 | 1.093156 | K |
9998 | -0.362559 | 0.598894 | -1.843201 | 0.887292 | G |
9999 | -0.096376 | -1.012999 | -0.657431 | -0.573315 | 0 |
In [36]:
pd.read_csv('ex6.csv', nrows=5) # 只读取前5行
Out[36]:
one | two | three | four | key | |
---|---|---|---|---|---|
0 | 0.467976 | -0.038649 | -0.295344 | -1.824726 | L |
1 | -0.358893 | 1.404453 | 0.704965 | -0.200638 | B |
2 | -0.501840 | 0.659254 | -0.421691 | -0.057688 | G |
3 | 0.204886 | 1.074134 | 1.388361 | -0.982404 | R |
4 | 0.354628 | -0.133116 | 0.283763 | -0.837063 | Q |
chunker = pd.read_csv('ex6.csv', chunksize=1000) chunker
Out[37]:
<pandas.io.parsers.TextFileReader at 0x8c6d190>
In [39]:
chunker = pd.read_csv('ex6.csv', chunksize=1000) tot = Series([]) for piece in chunker: tot = tot.add(piece['key'].value_counts(), fill_value=0) tot = tot.sort_values(ascending=False)
In [40]:
tot[:10]
Out[40]:
E 368.0 X 364.0 L 346.0 O 343.0 Q 340.0 M 338.0 J 337.0 F 335.0 K 334.0 H 330.0 dtype: float64
将数据写到文本文件
In [41]:data = pd.read_csv('ex5.csv') data
Out[41]:
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3.0 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11.0 | 12 | foo |
data.to_csv('out.csv')
In [45]:
data.to_csv(sys.stdout, sep='|')# 分隔符用'|'
|something|a|b|c|d|message 0|one|1|2|3.0|4| 1|two|5|6||8|world 2|three|9|10|11.0|12|foo
In [46]:
data.to_csv(sys.stdout, na_rep='NULL')# 缺失数据用NULL代替
,something,a,b,c,d,message 0,one,1,2,3.0,4,NULL 1,two,5,6,NULL,8,world 2,three,9,10,11.0,12,foo
In [47]:
data.to_csv(sys.stdout, index=False, header=False) #禁用行和列标签
one,1,2,3.0,4, two,5,6,,8,world three,9,10,11.0,12,foo
In [48]:
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])# 指定部分列和顺序
a,b,c 1,2,3.0 5,6, 9,10,11.0
In [49]:
dates = pd.date_range('1/1/2000', periods=7) ts = Series(np.arange(7), index=dates) ts.to_csv('tseries.csv')
In [50]:
Series.from_csv('tseries.csv', parse_dates=True)
Out[50]:
2000-01-01 0 2000-01-02 1 2000-01-03 2 2000-01-04 3 2000-01-05 4 2000-01-06 5 2000-01-07 6 dtype: int64
手工处理分隔符格式
In [51]:import csv f = open('ex7.csv') reader = csv.reader(f)
In [52]:
for line in reader: print(line)
['a', 'b', 'c'] ['1', '2', '3'] ['1', '2', '3', '4']
In [55]:
lines = list(csv.reader(open('ex7.csv'))) lines
Out[55]:
[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3', '4']]
In [57]:
header, values = lines[0], lines[1:] header
Out[57]:
['a', 'b', 'c']
In [58]:
values
Out[58]:
[['1', '2', '3'], ['1', '2', '3', '4']]
In [63]:
zip(header, zip(*values))
Out[63]:
[('a', ('1', '1')), ('b', ('2', '2')), ('c', ('3', '3'))]
In [65]:
data_dict = {h: v for h, v in zip(header, zip(*values))} data_dict
Out[65]:
{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}
In [66]:
class my_dialect(csv.Dialect): lineterminator = '\n' delimiter = ';' quotechar = '"' quoting = csv.QUOTE_MINIMAL
In [67]:
with open('mydata.csv', 'w') as f: writer = csv.writer(f, dialect=my_dialect) writer.writerow(('one', 'two', 'three')) writer.writerow(('1', '2', '3')) writer.writerow(('4', '5', '6')) writer.writerow(('7', '8', '9'))
JSON数据
In [69]:obj = """ {"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}, {"name": "Katie", "age": 33, "pet": "Cisco"}] } """
In [70]:
import json result = json.loads(obj) result
Out[70]:
{u'name': u'Wes', u'pet': None, u'places_lived': [u'United States', u'Spain', u'Germany'], u'siblings': [{u'age': 25, u'name': u'Scott', u'pet': u'Zuko'}, {u'age': 33, u'name': u'Katie', u'pet': u'Cisco'}]}
In [71]:
asjson = json.dumps(result)
In [72]:
siblings = DataFrame(result['siblings'], columns=['name', 'age']) siblings
Out[72]:
name | age | |
---|---|---|
0 | Scott | 25 |
1 | Katie | 33 |
XML和HTML数据
In [73]:from lxml.html import parse from urllib2 import urlopen parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options')) doc = parsed.getroot()
In [74]:
links = doc.findall('.//a') links[15:20]
Out[74]:
[<Element a at 0x8f60d50>, <Element a at 0x8f60d80>, <Element a at 0x8f60db0>, <Element a at 0x8f60de0>, <Element a at 0x8f60e10>]
In [75]:
lnk = links[28] lnk lnk.get('href') lnk.text_content()
Out[75]:
'139.00'
二进制数据格式
In [88]:frame = pd.read_csv('ex1.csv') frame
Out[88]:
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
frame.to_pickle('frame_pickle')
In [90]:
pd.read_pickle('frame_pickle')
Out[90]:
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
使用HDF5格式
In [91]:store = pd.HDFStore('mydata.h5') store['obj1'] = frame store['obj1_col'] = frame['a'] store
Out[91]:
<class 'pandas.io.pytables.HDFStore'> File path: mydata.h5 /obj1 frame (shape->[3,5]) /obj1_col series (shape->[3])
In [92]:
store['obj1']
Out[92]:
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
store.close() os.remove('mydata.h5')
使用HTML和Web API
In [94]:import requests url = 'https://api.github.com/repos/pydata/pandas/milestones/28/labels' resp = requests.get(url) resp
Out[94]:
<Response [200]>
In [95]:
data[:5]
Out[95]:
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3.0 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11.0 | 12 | foo |
issue_labels = DataFrame(data) issue_labels
Out[96]:
something | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | one | 1 | 2 | 3.0 | 4 | NaN |
1 | two | 5 | 6 | NaN | 8 | world |
2 | three | 9 | 10 | 11.0 | 12 | foo |
使用数据库
In [97]:import sqlite3 query = """ CREATE TABLE test (a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER );""" con = sqlite3.connect(':memory:') con.execute(query) # 创建表 con.commit() # 发出命令
In [98]:
data = [('Atlanta', 'Georgia', 1.25, 6), ('Tallahassee', 'Florida', 2.6, 3), ('Sacramento', 'California', 1.7, 5)] stmt = "INSERT INTO test VALUES(?, ?, ?, ?)" con.executemany(stmt, data) # 插入数据 con.commit()
In [99]:
cursor = con.execute('select * from test') #查询 rows = cursor.fetchall() rows
Out[99]:
[(u'Atlanta', u'Georgia', 1.25, 6), (u'Tallahassee', u'Florida', 2.6, 3), (u'Sacramento', u'California', 1.7, 5)]
In [100]:
cursor.description
Out[100]:
(('a', None, None, None, None, None, None), ('b', None, None, None, None, None, None), ('c', None, None, None, None, None, None), ('d', None, None, None, None, None, None))
In [101]:
DataFrame(rows, columns=zip(*cursor.description)[0]) # 增加列名
Out[101]:
a | b | c | d | |
---|---|---|---|---|
0 | Atlanta | Georgia | 1.25 | 6 |
1 | Tallahassee | Florida | 2.60 | 3 |
2 | Sacramento | California | 1.70 | 5 |
import pandas.io.sql as sql sql.read_sql('select * from test', con) # 简单的归整数据方法(调用pandas.io.sql)
Out[102]:
a | b | c | d | |
---|---|---|---|---|
0 | Atlanta | Georgia | 1.25 | 6 |
1 | Tallahassee | Florida | 2.60 | 3 |
2 | Sacramento | California | 1.70 | 5 |
相关文章推荐
- 利用python进行数据分析-数据加载、存储与文件格式1
- 利用python进行数据分析之数据加载存储与文件格式
- 利用python进行数据分析-数据加载、存储与文件格式2
- 数据加载、存储与文件格式 利用Python进行数据分析 第6章
- 利用Python进行数据分析--数据加载、存储与文件格式
- Python 数据分析(一) 本实验将学习 pandas 基础,数据加载、存储与文件格式,数据规整化,绘图和可视化的知识
- Python 数据分析(一) 本实验将学习 pandas 基础,数据加载、存储与文件格式,数据规整化,绘图和可视化的知识
- python数据分析之:数据加载,存储与文件格式
- 利用python进行数据分析(四):数据加载、存储
- python数据分析 第7天 数据的加载,存储,与文件格式
- Python中数据加载、存储与文件格式
- Python -- 数据加载、存储与文件格式
- Python数据加载、存储与文件格式
- Python中数据加载、存储与文件格式
- 【Python数据分析与展示】(七)数据加载存储和文件格式
- Python中数据加载、存储与文件格式
- 利用python进入数据分析之MovieLens_1M数据分析
- 【利用Python进行数据分析——经验篇3】如何操作DataFrame中的列的数据格式(转为百分数、保留4位小数)
- 利用python抓取搜狗关于数据分析的文章并保存到csv文件
- 利用python进入数据分析之准备篇