您的位置：首页 > 编程语言 > Python开发

利用python进入数据分析之数据加载、存储、文件格式

2017-10-21 11:06 726 查看

http://download.csdn.net/download/u013584315/10014865

导入相关包

In [44]:

from __future__ import division
from numpy.random import randn
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
import pandas as pd
np.set_printoptions(precision=4)

读写文本格式的数据

In [4]:

df = pd.read_csv('ex1.csv')
df

Out[4]:


	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

In [5]:

pd.read_table('ex1.csv', sep=',')

Out[5]:


	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

In [6]:

pd.read_csv('ex2.csv', header=None)

Out[6]:


	0	1	2	3	4
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

In [7]:

pd.read_csv('ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Out[7]:


	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

In [8]:

names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ex2.csv', names=names, index_col='message') # message 作为DF的索引

Out[8]:


	a	b	c	d
message
hello	1	2	3	4
world	5	6	7	8
foo	9	10	11	12

In [9]:

parsed = pd.read_csv('csv_mindex.csv', index_col=['key1', 'key2'])
parsed

Out[9]:


		value1	value2
key1	key2
one	a	1	2
b	3	4
c	5	6
d	7	8
two	a	9	10
b	11	12
c	13	14
d	15	16

In [10]:

list(open('ex3.txt'))

Out[10]:

['            A         B         C\n',
'aaa -0.264438 -1.026059 -0.619500\n',
'bbb  0.927272  0.302904 -0.032399\n',
'ccc -0.264273 -0.386314 -0.217601\n',
'ddd -0.871858 -0.348382  1.100491\n']

In [11]:

result = pd.read_table('ex3.txt', sep='\s+')
result

Out[11]:


	A	B	C
aaa	-0.264438	-1.026059	-0.619500
bbb	0.927272	0.302904	-0.032399
ccc	-0.264273	-0.386314	-0.217601
ddd	-0.871858	-0.348382	1.100491

In [12]:

pd.read_csv('ex4.csv', skiprows=[0, 2, 3]) # skiprows 跳过数据

Out[12]:


	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

In [14]:

result = pd.read_csv('ex5.csv')
result

Out[14]:


	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

In [15]:

pd.isnull(result) #判断是否为空值

Out[15]:


	something	a	b	c	d	message
0	False	False	False	False	False	True
1	False	False	False	True	False	False
2	False	False	False	False	False	False

In [33]:

result = pd.read_csv('ex5.csv',na_values=['NULL']) # na_values将某个数值变成空值

Out[33]:


	something	a	b	c	d	message
0	NaN	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

In [34]:

sentinels = {'message': ['foo'], 'something': ['two']}
pd.read_csv('ex5.csv', na_values=sentinels) # na_values 可以接受一个字典

Out[34]:


	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	NaN	5	6	NaN	8	world
2	three	9	10	11.0	12	NaN

逐块读取文件

In [35]:

result = pd.read_csv('ex6.csv')
result

Out[35]:


	one	two	three	four	key
0	0.467976	-0.038649	-0.295344	-1.824726	L
1	-0.358893	1.404453	0.704965	-0.200638	B
2	-0.501840	0.659254	-0.421691	-0.057688	G
3	0.204886	1.074134	1.388361	-0.982404	R
4	0.354628	-0.133116	0.283763	-0.837063	Q
5	1.817480	0.742273	0.419395	-2.251035	Q
6	-0.776764	0.935518	-0.332872	-1.875641	U
7	-0.913135	1.530624	-0.572657	0.477252	K
8	0.358480	-0.497572	-0.367016	0.507702	S
9	-1.740877	-1.160417	-1.637830	2.172201	G
10	0.240564	-0.328249	1.252155	1.072796	8
11	0.764018	1.165476	-0.639544	1.495258	R
12	0.571035	-0.310537	0.582437	-0.298765	1
13	2.317658	0.430710	-1.334216	0.199679	P
14	1.547771	-1.119753	-2.277634	0.329586	J
15	-1.310608	0.401719	-1.000987	1.156708	E
16	-0.088496	0.634712	0.153324	0.415335	B
17	-0.018663	-0.247487	-1.446522	0.750938	A
18	-0.070127	-1.579097	0.120892	0.671432	F
19	-0.194678	-0.492039	2.359605	0.319810	H
20	-0.248618	0.868707	-0.492226	-0.717959	W
21	-1.091549	-0.867110	-0.647760	-0.832562	C
22	0.641404	-0.138822	-0.621963	-0.284839	C
23	1.216408	0.992687	0.165162	-0.069619	V
24	-0.564474	0.792832	0.747053	0.571675	I
25	1.759879	-0.515666	-0.230481	1.362317	S
26	0.126266	0.309281	0.382820	-0.239199	L
27	1.334360	-0.100152	-0.840731	-0.643967	6
28	-0.737620	0.278087	-0.053235	-0.950972	J
29	-1.148486	-0.986292	-0.144963	0.124362	Y
...	...	...	...	...	...
9970	0.633495	-0.186524	0.927627	0.143164	4
9971	0.308636	-0.112857	0.762842	-1.072977	1
9972	-1.627051	-0.978151	0.154745	-1.229037	Z
9973	0.314847	0.097989	0.199608	0.955193	P
9974	1.666907	0.992005	0.496128	-0.686391	S
9975	0.010603	0.708540	-1.258711	0.226541	K
9976	0.118693	-0.714455	-0.501342	-0.254764	K
9977	0.302616	-2.011527	-0.628085	0.768827	H
9978	-0.098572	1.769086	-0.215027	-0.053076	A
9979	-0.019058	1.964994	0.738538	-0.883776	F
9980	-0.595349	0.001781	-1.423355	-1.458477	M
9981	1.392170	-1.396560	-1.425306	-0.847535	H
9982	-0.896029	-0.152287	1.924483	0.365184	6
9983	-2.274642	-0.901874	1.500352	0.996541	N
9984	-0.301898	1.019906	1.102160	2.624526	I
9985	-2.548389	-0.585374	1.496201	-0.718815	D
9986	-0.064588	0.759292	-1.568415	-0.420933	E
9987	-0.143365	-1.111760	-1.815581	0.435274	2
9988	-0.070412	-1.055921	0.338017	-0.440763	X
9989	0.649148	0.994273	-1.384227	0.485120	Q
9990	-0.370769	0.404356	-1.051628	-1.050899	8
9991	-0.409980	0.155627	-0.818990	1.277350	W
9992	0.301214	-1.111203	0.668258	0.671922	A
9993	1.821117	0.416445	0.173874	0.505118	X
9994	0.068804	1.322759	0.802346	0.223618	H
9995	2.311896	-0.417070	-1.409599	-0.515821	L
9996	-0.479893	-0.650419	0.745152	-0.646038	E
9997	0.523331	0.787112	0.486066	1.093156	K
9998	-0.362559	0.598894	-1.843201	0.887292	G
9999	-0.096376	-1.012999	-0.657431	-0.573315	0

10000 rows × 5 columns

In [36]:

pd.read_csv('ex6.csv', nrows=5) # 只读取前5行

Out[36]:


	one	two	three	four	key
0	0.467976	-0.038649	-0.295344	-1.824726	L
1	-0.358893	1.404453	0.704965	-0.200638	B
2	-0.501840	0.659254	-0.421691	-0.057688	G
3	0.204886	1.074134	1.388361	-0.982404	R
4	0.354628	-0.133116	0.283763	-0.837063	Q

In [37]:

chunker = pd.read_csv('ex6.csv', chunksize=1000)
chunker

Out[37]:

<pandas.io.parsers.TextFileReader at 0x8c6d190>

In [39]:

chunker = pd.read_csv('ex6.csv', chunksize=1000)

tot = Series([])
for piece in chunker:
tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)

In [40]:

tot[:10]

Out[40]:

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

将数据写到文本文件

In [41]:

data = pd.read_csv('ex5.csv')
data

Out[41]:


	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

In [42]:

data.to_csv('out.csv')

In [45]:

data.to_csv(sys.stdout, sep='|')# 分隔符用'|'

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo

In [46]:

data.to_csv(sys.stdout, na_rep='NULL')# 缺失数据用NULL代替

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo

In [47]:

data.to_csv(sys.stdout, index=False, header=False) #禁用行和列标签

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo

In [48]:

data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])# 指定部分列和顺序

a,b,c
1,2,3.0
5,6,
9,10,11.0

In [49]:

dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)
ts.to_csv('tseries.csv')

In [50]:

Series.from_csv('tseries.csv', parse_dates=True)

Out[50]:

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
dtype: int64

手工处理分隔符格式

In [51]:

import csv
f = open('ex7.csv')
reader = csv.reader(f)

In [52]:

for line in reader:
print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']

In [55]:

lines = list(csv.reader(open('ex7.csv')))
lines

Out[55]:

[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3', '4']]

In [57]:

header, values = lines[0], lines[1:]
header

Out[57]:

['a', 'b', 'c']

In [58]:

values

Out[58]:

[['1', '2', '3'], ['1', '2', '3', '4']]

In [63]:

zip(header, zip(*values))

Out[63]:

[('a', ('1', '1')), ('b', ('2', '2')), ('c', ('3', '3'))]

In [65]:

data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict

Out[65]:

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [66]:

class my_dialect(csv.Dialect):
lineterminator = '\n'
delimiter = ';'
quotechar = '"'
quoting = csv.QUOTE_MINIMAL

In [67]:

with open('mydata.csv', 'w') as f:
writer = csv.writer(f, dialect=my_dialect)
writer.writerow(('one', 'two', 'three'))
writer.writerow(('1', '2', '3'))
writer.writerow(('4', '5', '6'))
writer.writerow(('7', '8', '9'))

JSON数据

In [69]:

obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
{"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""

In [70]:

import json
result = json.loads(obj)
result

Out[70]:

{u'name': u'Wes',
u'pet': None,
u'places_lived': [u'United States', u'Spain', u'Germany'],
u'siblings': [{u'age': 25, u'name': u'Scott', u'pet': u'Zuko'},
{u'age': 33, u'name': u'Katie', u'pet': u'Cisco'}]}

In [71]:

asjson = json.dumps(result)

In [72]:

siblings = DataFrame(result['siblings'], columns=['name', 'age'])
siblings

Out[72]:


	name	age
0	Scott	25
1	Katie	33

XML和HTML数据

In [73]:

from lxml.html import parse
from urllib2 import urlopen

parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))

doc = parsed.getroot()

In [74]:

links = doc.findall('.//a')
links[15:20]

Out[74]:

[<Element a at 0x8f60d50>,
<Element a at 0x8f60d80>,
<Element a at 0x8f60db0>,
<Element a at 0x8f60de0>,
<Element a at 0x8f60e10>]

In [75]:

lnk = links[28]
lnk
lnk.get('href')
lnk.text_content()

Out[75]:

'139.00'

二进制数据格式

In [88]:

frame = pd.read_csv('ex1.csv')
frame

Out[88]:


	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

In [89]:

frame.to_pickle('frame_pickle')

In [90]:

pd.read_pickle('frame_pickle')

Out[90]:


	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

使用HDF5格式

In [91]:

store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store

Out[91]:

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5
/obj1                frame        (shape->[3,5])
/obj1_col            series       (shape->[3])

In [92]:

store['obj1']

Out[92]:


	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

In [93]:

store.close()
os.remove('mydata.h5')

使用HTML和Web API

In [94]:

import requests
url = 'https://api.github.com/repos/pydata/pandas/milestones/28/labels'
resp = requests.get(url)
resp

Out[94]:

<Response [200]>

In [95]:

data[:5]

Out[95]:


	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

In [96]:

issue_labels = DataFrame(data)
issue_labels

Out[96]:


	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

使用数据库

In [97]:

import sqlite3

query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
c REAL,        d INTEGER
);"""

con = sqlite3.connect(':memory:')
con.execute(query) # 创建表
con.commit() # 发出命令

In [98]:

data = [('Atlanta', 'Georgia', 1.25, 6),
('Tallahassee', 'Florida', 2.6, 3),
('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"

con.executemany(stmt, data) # 插入数据
con.commit()

In [99]:

cursor = con.execute('select * from test') #查询
rows = cursor.fetchall()
rows

Out[99]:

[(u'Atlanta', u'Georgia', 1.25, 6),
(u'Tallahassee', u'Florida', 2.6, 3),
(u'Sacramento', u'California', 1.7, 5)]

In [100]:

cursor.description

Out[100]:

(('a', None, None, None, None, None, None),
('b', None, None, None, None, None, None),
('c', None, None, None, None, None, None),
('d', None, None, None, None, None, None))

In [101]:

DataFrame(rows, columns=zip(*cursor.description)[0]) # 增加列名

Out[101]:


	a	b	c	d
0	Atlanta	Georgia	1.25	6
1	Tallahassee	Florida	2.60	3
2	Sacramento	California	1.70	5

In [102]:

import pandas.io.sql as sql
sql.read_sql('select * from test', con) # 简单的归整数据方法（调用pandas.io.sql）

Out[102]:


	a	b	c	d
0	Atlanta	Georgia	1.25	6
1	Tallahassee	Florida	2.60	3
2	Sacramento	California	1.70	5

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： python numpy pandas 数据

相关文章推荐

新的分享

章节导航