
Spark + Python Machine Learning 1: Data Exploration and Visualization

2018-01-16
Data source: http://files.grouplens.org/datasets/movielens/ml-100k.zip
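Each line of u.user is pipe-delimited with the fields user id | age | gender | occupation | zip code; the first line of the file should look like this:

1|24|M|technician|85711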

Displaying the data
import matplotlib.pyplot as plt
from pyspark import SparkContext

sc = SparkContext("local", "first Spark app")
# Load the user data and split each pipe-delimited line into fields
user_data = sc.textFile("./ml-100k/u.user")
user_fields = user_data.map(lambda line: line.split("|"))
"""
一下为统计各个列类型的总数,注意:distinct
"""
num_users = user_fields.map(lambda fields: fields[0]).count()
num_genders = user_fields.map(lambda fields:
fields[2]).distinct().count()
num_occupations = user_fields.map(lambda fields:
fields[3]).distinct().count()
num_zipcodes = user_fields.map(lambda fields:
fields[4]).distinct().count()
print("Users: %d, genders: %d, occupations: %d, ZIP codes: %d" % (num_users, num_genders,
num_occupations, num_zipcodes))

"""
绘制直方图,显示各年龄段注册用户分布图。
"""

ages = user_fields.map(lambda x: int(x[1])).collect()
# normed= was removed in matplotlib 3.x; density=True is the replacement
plt.hist(ages, bins=20, color='lightblue', density=True)
fig = plt.gcf()
fig.set_size_inches(16, 10)
plt.show()
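collect() pulls every age back to the driver, which is harmless for 943 users but does not scale. As a minimal sketch (assuming the same user_fields RDD as above), Spark can compute the bucket counts on the executors via RDD.histogram, so only the counts are transferred:

ages_rdd = user_fields.map(lambda x: int(x[1]))
# histogram(20) returns 21 evenly spaced bucket boundaries and 20 counts
buckets, counts = ages_rdd.histogram(20)
plt.bar(buckets[:-1], counts, width=buckets[1] - buckets[0],
        color='lightblue', align='edge')
plt.show()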
Result: a histogram of the user age distribution (figure not reproduced here).

Counting users by occupation
import numpy as np
import matplotlib.pyplot as plt
from pyspark import SparkContext

sc = SparkContext("local", "first Spark app")
user_data = sc.textFile("./ml-100k/u.user")
user_fields = user_data.map(lambda line: line.split("|"))
"""
一下为统计各个列类型的总数,注意:distinct
"""
num_users = user_fields.map(lambda fields: fields[0]).count()
num_genders = user_fields.map(lambda fields:
fields[2]).distinct().count()
num_occupations = user_fields.map(lambda fields:
fields[3]).distinct().count()
num_zipcodes = user_fields.map(lambda fields:
fields[4]).distinct().count()
print("Users: %d, genders: %d, occupations: %d, ZIP codes: %d" % (num_users, num_genders,
num_occupations, num_zipcodes))
"""
统计所有职业的个数
"""
count_by_occupation = user_fields.map(lambda fields: (fields[3], 1)).reduceByKey(lambda x, y: x + y).collect()

# Sort occupations by count so the bars rise from left to right
x_axis1 = np.array([c[0] for c in count_by_occupation])
y_axis1 = np.array([c[1] for c in count_by_occupation])
sort_idx = np.argsort(y_axis1)
x_axis = x_axis1[sort_idx]
y_axis = y_axis1[sort_idx]

pos = np.arange(len(x_axis))
width = 1.0
ax = plt.axes()
# In matplotlib >= 2.0 bars are centered on pos by default,
# so the tick labels go at pos rather than pos + width/2
ax.set_xticks(pos)
ax.set_xticklabels(x_axis)
plt.bar(pos, y_axis, width, color='lightblue')
plt.xticks(rotation=30)
fig = plt.gcf()
fig.set_size_inches(16, 10)
plt.show()

# countByValue() produces the same tallies directly as a dict
count_by_occupation2 = user_fields.map(lambda fields: fields[3]).countByValue()
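As a small follow-up sketch using the count_by_occupation2 dict computed above, the same tallies can be printed from most to least common without any plotting:

for occupation, count in sorted(count_by_occupation2.items(),
                                key=lambda kv: kv[1], reverse=True):
    print("%s: %d" % (occupation, count))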



Movie data statistics and filtering

from pyspark import SparkContext

sc = SparkContext("local", "first Spark app")
movie_data = sc.textFile("./ml-100k/u.item")
# Inspect the raw data format
print(movie_data.first())
num_movies = movie_data.count()
# Total number of records
print(num_movies)
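Per the MovieLens README, each line of u.item is pipe-delimited: movie id | title | release date | video release date | IMDb URL, followed by 19 binary genre flags. The release date (field index 2) is what the code below parses.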

def convert_year(x):
    """
    Parse the four-digit year from the end of the release-date string.
    If the year is missing, return 1900 as a sentinel; such records
    are filtered out in later processing.
    """
    try:
        return int(x[-4:])
    except ValueError:
        return 1900

def return_line(fields):
    # Replace the raw release-date string with the parsed year, in place
    fields[2] = convert_year(fields[2])
    return fields

# Filter out records with a bad release date
movie_fields = movie_data.map(lambda lines: lines.split("|"))
years = movie_fields.map(lambda fields: fields[2]).map(convert_year)
years_filtered = years.filter(lambda x: x != 1900)
print(years_filtered.first())

# Alternatively, keep all the fields and convert the year in place
converted_fields = movie_fields.map(return_line)
print(converted_fields.collect())
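Building on the years RDD above, a quick sanity check (a minimal sketch) counts how many records hit the 1900 sentinel, i.e. had an unparseable release year:

num_bad = years.filter(lambda x: x == 1900).count()
print("Records with a missing/invalid release year: %d" % num_bad)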