您的位置:首页 > 编程语言 > Python开发

利用python进入数据分析之数据加载、存储、文件格式

2017-10-21 11:06 155 查看


相关数据测试文件,请到此处下载:

http://download.csdn.net/download/u013584315/10014865


导入相关包

In [44]:

from __future__ import division
from numpy.random import randn
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
import pandas as pd
np.set_printoptions(precision=4)



读写文本格式的数据

In [4]:

df = pd.read_csv('ex1.csv')
df


Out[4]:

abcdmessage
01234hello
15678world
29101112foo
In [5]:

pd.read_table('ex1.csv', sep=',')


Out[5]:

abcdmessage
01234hello
15678world
29101112foo
In [6]:

pd.read_csv('ex2.csv', header=None)


Out[6]:

01234
01234hello
15678world
29101112foo
In [7]:

pd.read_csv('ex2.csv', names=['a', 'b', 'c', 'd', 'message'])


Out[7]:

abcdmessage
01234hello
15678world
29101112foo
In [8]:

names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ex2.csv', names=names, index_col='message') # message 作为DF的索引


Out[8]:

abcd
message
hello1234
world5678
foo9101112
In [9]:

parsed = pd.read_csv('csv_mindex.csv', index_col=['key1', 'key2'])
parsed


Out[9]:

value1value2
key1key2
onea12
b34
c56
d78
twoa910
b1112
c1314
d1516
In [10]:

list(open('ex3.txt'))


Out[10]:

['            A         B         C\n',
'aaa -0.264438 -1.026059 -0.619500\n',
'bbb  0.927272  0.302904 -0.032399\n',
'ccc -0.264273 -0.386314 -0.217601\n',
'ddd -0.871858 -0.348382  1.100491\n']


In [11]:

result = pd.read_table('ex3.txt', sep='\s+')
result


Out[11]:

ABC
aaa-0.264438-1.026059-0.619500
bbb0.9272720.302904-0.032399
ccc-0.264273-0.386314-0.217601
ddd-0.871858-0.3483821.100491
In [12]:

pd.read_csv('ex4.csv', skiprows=[0, 2, 3]) # skiprows 跳过数据


Out[12]:

abcdmessage
01234hello
15678world
29101112foo
In [14]:

result = pd.read_csv('ex5.csv')
result


Out[14]:

somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo
In [15]:

pd.isnull(result) #判断是否为空值


Out[15]:

somethingabcdmessage
0FalseFalseFalseFalseFalseTrue
1FalseFalseFalseTrueFalseFalse
2FalseFalseFalseFalseFalseFalse
In [33]:

result = pd.read_csv('ex5.csv',na_values=['NULL']) # na_values将某个数值变成空值


Out[33]:

somethingabcdmessage
0NaN123.04NaN
1two56NaN8world
2three91011.012foo
In [34]:

sentinels = {'message': ['foo'], 'something': ['two']}
pd.read_csv('ex5.csv', na_values=sentinels) # na_values 可以接受一个字典


Out[34]:

somethingabcdmessage
0one123.04NaN
1NaN56NaN8world
2three91011.012NaN


逐块读取文件

In [35]:

result = pd.read_csv('ex6.csv')
result


Out[35]:

onetwothreefourkey
00.467976-0.038649-0.295344-1.824726L
1-0.3588931.4044530.704965-0.200638B
2-0.5018400.659254-0.421691-0.057688G
30.2048861.0741341.388361-0.982404R
40.354628-0.1331160.283763-0.837063Q
51.8174800.7422730.419395-2.251035Q
6-0.7767640.935518-0.332872-1.875641U
7-0.9131351.530624-0.5726570.477252K
80.358480-0.497572-0.3670160.507702S
9-1.740877-1.160417-1.6378302.172201G
100.240564-0.3282491.2521551.0727968
110.7640181.165476-0.6395441.495258R
120.571035-0.3105370.582437-0.2987651
132.3176580.430710-1.3342160.199679P
141.547771-1.119753-2.2776340.329586J
15-1.3106080.401719-1.0009871.156708E
16-0.0884960.6347120.1533240.415335B
17-0.018663-0.247487-1.4465220.750938A
18-0.070127-1.5790970.1208920.671432F
19-0.194678-0.4920392.3596050.319810H
20-0.2486180.868707-0.492226-0.717959W
21-1.091549-0.867110-0.647760-0.832562C
220.641404-0.138822-0.621963-0.284839C
231.2164080.9926870.165162-0.069619V
24-0.5644740.7928320.7470530.571675I
251.759879-0.515666-0.2304811.362317S
260.1262660.3092810.382820-0.239199L
271.334360-0.100152-0.840731-0.6439676
28-0.7376200.278087-0.053235-0.950972J
29-1.148486-0.986292-0.1449630.124362Y
..................
99700.633495-0.1865240.9276270.1431644
99710.308636-0.1128570.762842-1.0729771
9972-1.627051-0.9781510.154745-1.229037Z
99730.3148470.0979890.1996080.955193P
99741.6669070.9920050.496128-0.686391S
99750.0106030.708540-1.2587110.226541K
99760.118693-0.714455-0.501342-0.254764K
99770.302616-2.011527-0.6280850.768827H
9978-0.0985721.769086-0.215027-0.053076A
9979-0.0190581.9649940.738538-0.883776F
9980-0.5953490.001781-1.423355-1.458477M
99811.392170-1.396560-1.425306-0.847535H
9982-0.896029-0.1522871.9244830.3651846
9983-2.274642-0.9018741.5003520.996541N
9984-0.3018981.0199061.1021602.624526I
9985-2.548389-0.5853741.496201-0.718815D
9986-0.0645880.759292-1.568415-0.420933E
9987-0.143365-1.111760-1.8155810.4352742
9988-0.070412-1.0559210.338017-0.440763X
99890.6491480.994273-1.3842270.485120Q
9990-0.3707690.404356-1.051628-1.0508998
9991-0.4099800.155627-0.8189901.277350W
99920.301214-1.1112030.6682580.671922A
99931.8211170.4164450.1738740.505118X
99940.0688041.3227590.8023460.223618H
99952.311896-0.417070-1.409599-0.515821L
9996-0.479893-0.6504190.745152-0.646038E
99970.5233310.7871120.4860661.093156K
9998-0.3625590.598894-1.8432010.887292G
9999-0.096376-1.012999-0.657431-0.5733150
10000 rows × 5 columns

In [36]:

pd.read_csv('ex6.csv', nrows=5) # 只读取前5行


Out[36]:

onetwothreefourkey
00.467976-0.038649-0.295344-1.824726L
1-0.3588931.4044530.704965-0.200638B
2-0.5018400.659254-0.421691-0.057688G
30.2048861.0741341.388361-0.982404R
40.354628-0.1331160.283763-0.837063Q
In [37]:

chunker = pd.read_csv('ex6.csv', chunksize=1000)
chunker


Out[37]:

<pandas.io.parsers.TextFileReader at 0x8c6d190>


In [39]:

chunker = pd.read_csv('ex6.csv', chunksize=1000)

tot = Series([])
for piece in chunker:
tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)


In [40]:

tot[:10]


Out[40]:

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64



将数据写到文本文件

In [41]:

data = pd.read_csv('ex5.csv')
data


Out[41]:

somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo
In [42]:

data.to_csv('out.csv')


In [45]:

data.to_csv(sys.stdout, sep='|')# 分隔符用'|'


|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [46]:

data.to_csv(sys.stdout, na_rep='NULL')# 缺失数据用NULL代替


,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [47]:

data.to_csv(sys.stdout, index=False, header=False) #禁用行和列标签


one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [48]:

data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])# 指定部分列和顺序


a,b,c
1,2,3.0
5,6,
9,10,11.0


In [49]:

dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)
ts.to_csv('tseries.csv')


In [50]:

Series.from_csv('tseries.csv', parse_dates=True)


Out[50]:

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
dtype: int64



手工处理分隔符格式

In [51]:

import csv
f = open('ex7.csv')
reader = csv.reader(f)


In [52]:

for line in reader:
print(line)


['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']


In [55]:

lines = list(csv.reader(open('ex7.csv')))
lines


Out[55]:

[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3', '4']]


In [57]:

header, values = lines[0], lines[1:]
header


Out[57]:

['a', 'b', 'c']


In [58]:

values


Out[58]:

[['1', '2', '3'], ['1', '2', '3', '4']]


In [63]:

zip(header, zip(*values))


Out[63]:

[('a', ('1', '1')), ('b', ('2', '2')), ('c', ('3', '3'))]


In [65]:

data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict


Out[65]:

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}


In [66]:

class my_dialect(csv.Dialect):
lineterminator = '\n'
delimiter = ';'
quotechar = '"'
quoting = csv.QUOTE_MINIMAL


In [67]:

with open('mydata.csv', 'w') as f:
writer = csv.writer(f, dialect=my_dialect)
writer.writerow(('one', 'two', 'three'))
writer.writerow(('1', '2', '3'))
writer.writerow(('4', '5', '6'))
writer.writerow(('7', '8', '9'))



JSON数据

In [69]:

obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
{"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""


In [70]:

import json
result = json.loads(obj)
result


Out[70]:

{u'name': u'Wes',
u'pet': None,
u'places_lived': [u'United States', u'Spain', u'Germany'],
u'siblings': [{u'age': 25, u'name': u'Scott', u'pet': u'Zuko'},
{u'age': 33, u'name': u'Katie', u'pet': u'Cisco'}]}


In [71]:

asjson = json.dumps(result)


In [72]:

siblings = DataFrame(result['siblings'], columns=['name', 'age'])
siblings


Out[72]:

nameage
0Scott25
1Katie33


XML和HTML数据

In [73]:

from lxml.html import parse
from urllib2 import urlopen

parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))

doc = parsed.getroot()


In [74]:

links = doc.findall('.//a')
links[15:20]


Out[74]:

[<Element a at 0x8f60d50>,
<Element a at 0x8f60d80>,
<Element a at 0x8f60db0>,
<Element a at 0x8f60de0>,
<Element a at 0x8f60e10>]


In [75]:

lnk = links[28]
lnk
lnk.get('href')
lnk.text_content()


Out[75]:

'139.00'



二进制数据格式

In [88]:

frame = pd.read_csv('ex1.csv')
frame


Out[88]:

abcdmessage
01234hello
15678world
29101112foo
In [89]:

frame.to_pickle('frame_pickle')


In [90]:

pd.read_pickle('frame_pickle')


Out[90]:

abcdmessage
01234hello
15678world
29101112foo


使用HDF5格式

In [91]:

store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store


Out[91]:

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5
/obj1                frame        (shape->[3,5])
/obj1_col            series       (shape->[3])


In [92]:

store['obj1']


Out[92]:

abcdmessage
01234hello
15678world
29101112foo
In [93]:

store.close()
os.remove('mydata.h5')



使用HTML和Web API

In [94]:

import requests
url = 'https://api.github.com/repos/pydata/pandas/milestones/28/labels'
resp = requests.get(url)
resp


Out[94]:

<Response [200]>


In [95]:

data[:5]


Out[95]:

somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo
In [96]:

issue_labels = DataFrame(data)
issue_labels


Out[96]:

somethingabcdmessage
0one123.04NaN
1two56NaN8world
2three91011.012foo


使用数据库

In [97]:

import sqlite3

query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
c REAL,        d INTEGER
);"""

con = sqlite3.connect(':memory:')
con.execute(query) # 创建表
con.commit() # 发出命令


In [98]:

data = [('Atlanta', 'Georgia', 1.25, 6),
('Tallahassee', 'Florida', 2.6, 3),
('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"

con.executemany(stmt, data) # 插入数据
con.commit()


In [99]:

cursor = con.execute('select * from test') #查询
rows = cursor.fetchall()
rows


Out[99]:

[(u'Atlanta', u'Georgia', 1.25, 6),
(u'Tallahassee', u'Florida', 2.6, 3),
(u'Sacramento', u'California', 1.7, 5)]


In [100]:

cursor.description


Out[100]:

(('a', None, None, None, None, None, None),
('b', None, None, None, None, None, None),
('c', None, None, None, None, None, None),
('d', None, None, None, None, None, None))


In [101]:

DataFrame(rows, columns=zip(*cursor.description)[0]) # 增加列名


Out[101]:

abcd
0AtlantaGeorgia1.256
1TallahasseeFlorida2.603
2SacramentoCalifornia1.705
In [102]:

import pandas.io.sql as sql
sql.read_sql('select * from test', con) # 简单的归整数据方法(调用pandas.io.sql)


Out[102]:

abcd
0AtlantaGeorgia1.256
1TallahasseeFlorida2.603
2SacramentoCalifornia1.705
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  python numpy pandas 数据