数据科学包-Day5

字符串的离散化
数据合并

字符串的离散化

在这里插入图片描述

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Load the IMDB movie table and let pandas print every column.
file_path = "./datasets_IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Split each movie's comma-separated "Genre" string into a list of labels.
temp_list = df["Genre"].str.split(",").tolist()
# Distinct genre labels seen across all movies (set order is arbitrary).
genre_list = list({genre for genres in temp_list for genre in genres})

# All-zero indicator table: df.shape[0] rows (one per movie) and
# len(genre_list) columns, with the genre labels as column names.
n_movies = df.shape[0]
zeros_df = pd.DataFrame(
    np.zeros((n_movies, len(genre_list))),
    columns=genre_list,
)

# Put a 1 in every genre column that the movie in that row belongs to.
for row, genres in enumerate(temp_list):
    zeros_df.loc[row, genres] = 1

# Summing down each column gives the number of movies per genre.
genre_count = zeros_df.sum(axis=0)
print(genre_count)

# Sort ascending so the bars grow from left to right.
genre_count = genre_count.sort_values()
_x = genre_count.index
_y = genre_count.values

# Bar chart: one bar per genre, labelled with the genre name.
positions = range(len(_x))
plt.figure(figsize=(20, 10), dpi=100)
plt.bar(positions, _y, color="red")
plt.xticks(positions, _x)
plt.show()

War           13.0
Horror       119.0
Mystery      106.0
Animation     49.0
Family        51.0
Action       303.0
Fantasy      101.0
Crime        150.0
Romance      141.0
Adventure    259.0
History       29.0
Music         16.0
Sport         18.0
Drama        513.0
Thriller     195.0
Western        7.0
Biography     81.0
Musical        5.0
Comedy       279.0
Sci-Fi       120.0
dtype: float64

在这里插入图片描述

数据合并

join：默认情况下，它把行索引相同的数据合并在一起

In [8]: t=pd.DataFrame(np.arange(12).reshape((3,4)),index=list("ABC"),columns=list("WXYZ"))

In [9]: t
Out[9]:
   W  X   Y   Z
A  0  1   2   3
B  4  5   6   7
C  8  9  10  11
In [17]: t1=pd.DataFrame(np.arange(9).reshape(3,3),index=list("ABC"))
In [19]: t1
Out[19]:
   0  1  2
A  0  1  2
B  3  4  5
C  6  7  8

In [20]: t.join(t1)
Out[20]:
   W  X   Y   Z  0  1  2
A  0  1   2   3  0  1  2
B  4  5   6   7  3  4  5
C  8  9  10  11  6  7  8

merge:按照指定的列把数据按照一定的方式合并到一起

在这里插入图片描述

In [50]: t
Out[50]:
   W  X   Y   Z
A  0  1   2   3
B  4  5   6   7
C  8  9  10  11

In [51]: t1
Out[51]:
   J  K  L
A  0  1  2
B  3  4  5
C  6  7  8

In [52]: t.merge(t1,left_on="W",right_on="J")
Out[52]:
   W  X  Y  Z  J  K  L
0  0  1  2  3  0  1  2

数据分组聚合

在这里插入图片描述

grouped=df.groupby(by="columns_name")

grouped是一个DataFrameGroupBy对象，是可迭代的

grouped中的每一个元素是一个元组

元组里面是（索引（分组的值），分组之后的DataFrame）

在这里插入图片描述

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Load the worldwide Starbucks store table and show all columns when printing.
file_path = "./starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)
# print(df.head())
# print(df.info())

# Grouping by one column yields a DataFrameGroupBy object:
# grouped = df.groupby(by="Country")
# print(grouped)
#
# It can be iterated; each item is (group key, sub-DataFrame):
# for i, j in grouped:
#     print(i)
#     print("*" * 100)
#     print(j, type(j))
#     print("*" * 100)
#
# Equivalent manual selection of a single group:
# df[df["Country"] == "US"]
#
# Calling an aggregation on the grouped object:
# country_count = grouped["Brand"].count()
# print(country_count["US"])
# print(country_count["CN"])

# Count the stores in every Chinese province.
# Single-key alternative:
# china_data.groupby(by="State/Province").count()["Brand"]
china_data = df[df["Country"] == "CN"]
grouped = china_data.groupby(by=["Country", "State/Province"])["Brand"].count()
print(grouped)
print(type(grouped))

# Grouping by several keys: selecting with [["Country"]] (a list) keeps the
# result a DataFrame instead of a Series.
# t1 = df[["Country"]].groupby(by=[df["Country"], df["State/Province"]]).count()
# print(t1)
# print(type(t1))
# t2 = df.groupby(by=[df["Country"], df["State/Province"]])[["Country"]].count()
# print(t2)
# print(type(t2))

在这里插入图片描述

索引和复合索引

简单的索引操作：

获取index:df.index

指定index：df.index=['x','y']

重新设置index：df.reindex(list("abcdef"))

指定某一列作为index：df.set_index("Country",drop=False)

返回index的唯一值：df.set_index("Country").index.unique()

In [5]: df1 = pd.DataFrame(np.arange(8).reshape((2, 4)), index=list("ab"),columns = list("abcd"))

In [6]: df1
Out[6]:
   a  b  c  d
a  0  1  2  3
b  4  5  6  7
In [7]: df1.reindex(["a","f"])
Out[7]:
     a    b    c    d
a  0.0  1.0  2.0  3.0
f  NaN  NaN  NaN  NaN
In [9]: df1.reindex(["a","f"])
Out[9]:
     a    b    c    d
a  0.0  1.0  2.0  3.0
f  NaN  NaN  NaN  NaN

In [10]: df1.set_index("b")
Out[10]:
   a  c  d
b
1  0  2  3
5  4  6  7

In [11]: df1.set_index("c").index
Out[11]: Int64Index([2, 6], dtype='int64', name='c')
In [12]: df1.set_index("a",drop=False)
Out[12]:
   a  b  c  d
a
0  0  1  2  3
4  4  5  6  7
In [13]: df1["c"].unique()
Out[13]: array([2, 6], dtype=int64)

In [14]:  df1["d"].unique()
Out[14]: array([3, 7], dtype=int64)
In [15]: df1.set_index(["a","c"])
Out[15]:
     b  d
a c
0 2  1  3
4 6  5  7
In [16]: df1.set_index(["a","c"]).index
Out[16]:
MultiIndex([(0, 2),
            (4, 6)],
           names=['a', 'c'])
In [17]:  df1.set_index(["a","c","d"],drop=False).index
Out[17]:
MultiIndex([(0, 2, 3),
            (4, 6, 7)],
           names=['a', 'c', 'd'])

In [18]:  df1.set_index(["a","c","d"],drop=False)
Out[18]:
       a  b  c  d
a c d
0 2 3  0  1  2  3
4 6 7  4  5  6  7

In [12]: a=pd.DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':list("hjklmno")})

In [13]: a
Out[13]:
   a  b    c  d
0  0  7  one  h
1  1  6  one  j
2  2  5  one  k
3  3  4  two  l
4  4  3  two  m
5  5  2  two  n
6  6  1  two  o

In [14]: a.set_index(["c","d"])
Out[14]:
       a  b
c   d
one h  0  7
    j  1  6
    k  2  5
two l  3  4
    m  4  3
    n  5  2
    o  6  1

Series复合索引

In [13]: a
Out[13]:
   a  b    c  d
0  0  7  one  h
1  1  6  one  j
2  2  5  one  k
3  3  4  two  l
4  4  3  two  m
5  5  2  two  n
6  6  1  two  o

In [14]: a.set_index(["c","d"])
Out[14]:
       a  b
c   d
one h  0  7
    j  1  6
    k  2  5
two l  3  4
    m  4  3
    n  5  2
    o  6  1

In [15]: b= a.set_index(["c","d"])

In [16]: b
Out[16]:
       a  b
c   d
one h  0  7
    j  1  6
    k  2  5
two l  3  4
    m  4  3
    n  5  2
    o  6  1

In [17]: c=b["a"]

In [18]: c
Out[18]:
c    d
one  h    0
     j    1
     k    2
two  l    3
     m    4
     n    5
     o    6
Name: a, dtype: int64

In [19]: type(c)
Out[19]: pandas.core.series.Series

In [20]: c["one"]["j"]
Out[20]: 1

In [21]:  c["one"]
Out[21]:
d
h    0
j    1
k    2
Name: a, dtype: int64

In [22]: b.loc["one"].loc["h"]
Out[22]:
a    0
b    7
Name: h, dtype: int64

In [23]: x=a.set_index(["c","d"])["a"]

In [24]: x
Out[24]:
c    d
one  h    0
     j    1
     k    2
two  l    3
     m    4
     n    5
     o    6
Name: a, dtype: int64

In [25]: type(x)
Out[25]: pandas.core.series.Series

练习

在这里插入图片描述

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Load the worldwide Starbucks store table.
file_path = "./starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Goal: bar chart of the 10 countries with the most stores.
# Count the non-null "Brand" entries per country, largest first, keep ten.
stores_per_country = df.groupby(by="Country")["Brand"].count()
data1 = stores_per_country.sort_values(ascending=False).head(10)
_x = data1.index
_y = data1.values

# One bar per country, with country codes as the x tick labels.
positions = range(len(_x))
plt.figure(figsize=(20, 10), dpi=100)
plt.bar(positions, _y, color="orange")
plt.xticks(positions, _x)
plt.show()

在这里插入图片描述

import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as mpl
# Use a CJK-capable font so Chinese city names render, and keep the minus
# sign displayable with that font.
plt.rcParams['font.sans-serif'] = ['KaiTi']
mpl.rcParams["axes.unicode_minus"] = False

file_path = "./starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Goal: bar chart of store counts for the 15 Chinese cities with most stores.
china_data = df[df["Country"] == "CN"]
stores_per_city = china_data.groupby(by="City")["Brand"].count()
Data = stores_per_city.sort_values(ascending=False).head(15)
_x = Data.index
print(_x)
_y = Data.values

# Vertical bars, one per city, labelled with the city name.
positions = range(len(_x))
plt.figure(figsize=(20, 10), dpi=100)
plt.bar(positions, _y, width=0.3, color="orange")
plt.xticks(positions, _x)
plt.show()

在这里插入图片描述

import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as mpl
# Use a CJK-capable font so Chinese city names render, and keep the minus
# sign displayable with that font.
plt.rcParams['font.sans-serif'] = ['KaiTi']
mpl.rcParams["axes.unicode_minus"] = False

file_path = "./starbucks_store_worldwide.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Goal: horizontal bar chart of the 20 Chinese cities with the most stores.
china_data = df[df["Country"] == "CN"]
stores_per_city = china_data.groupby(by="City")["Brand"].count()
Data = stores_per_city.sort_values(ascending=False).head(20)
_x = Data.index
_y = Data.values

# Horizontal bars: the city names go on the y axis.
positions = range(len(_x))
plt.figure(figsize=(20, 10), dpi=100)
plt.barh(positions, _y, height=0.3, color="orange")
plt.yticks(positions, _x)
plt.show()

在这里插入图片描述

from matplotlib import pyplot as plt
import pandas as pd
import matplotlib as mlp
# Use a CJK-capable font and keep the minus sign displayable with it.
plt.rcParams['font.sans-serif'] = ['KaiTi']
plt.rcParams["axes.unicode_minus"] = False

file_path = "./books.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Goal: show how many books were published in each year.
# Drop the rows whose publication year is missing before grouping.
has_year = df["original_publication_year"].notna()
data1 = df[has_year]
books_per_year = data1.groupby(by="original_publication_year")["books_count"].count()
data2 = books_per_year.sort_values(ascending=False)

# Keep only the 20 years with the most books.
_x = data2.index[:20]
_y = data2.values[:20]

# Horizontal bars, one per year, labelled with the year on the y axis.
positions = range(len(_x))
plt.figure(figsize=(20, 10), dpi=100)
plt.barh(positions, _y, height=0.3, color="orange")
plt.yticks(positions, _x)
plt.show()

在这里插入图片描述

from matplotlib import pyplot as plt
import pandas as pd
import matplotlib as mlp
# Use a CJK-capable font and keep the minus sign displayable with it.
plt.rcParams['font.sans-serif'] = ['KaiTi']
plt.rcParams["axes.unicode_minus"] = False

file_path = "./books.csv"
df = pd.read_csv(file_path)
pd.set_option('display.max_columns', None)

# Goal: plot the mean book rating for each publication year.
# Rows without a publication year are removed first.
has_year = df["original_publication_year"].notna()
data1 = df[has_year]
data2 = data1.groupby(by="original_publication_year")["average_rating"].mean()
_x = data2.index
_y = data2.values

# Line plot over the year positions; label only every 10th year so the
# tick labels stay readable.
n_years = len(_x)
plt.figure(figsize=(20, 10), dpi=100)
plt.plot(range(n_years), _y, color="red")
plt.xticks(list(range(0, n_years, 10)), _x[::10], rotation=45)
plt.show()

在这里插入图片描述

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib as mlp
# Use a CJK-capable font and keep the minus sign displayable with it.
mlp.rcParams['font.sans-serif'] = ['KaiTi']
mlp.rcParams["axes.unicode_minus"] = False
pd.set_option('display.max_columns', None)

file_path = "./911.csv"
df = pd.read_csv(file_path)
# print(df.head(2))
# print(df.info())

# The category of a call is the part of "title" before the first ": ".
temp_list = df["title"].str.split(": ").tolist()
# Distinct categories; sorted so the column order (and the printed output)
# is the same on every run instead of following arbitrary set order.
cata_list = sorted({parts[0] for parts in temp_list})
# print(cata_list)
print(df.head(5))

# All-zero indicator table: one row per call, one column per category.
zeros_df = pd.DataFrame(
    np.zeros((df.shape[0], len(cata_list))),
    columns=cata_list,
)

# Mark each call's category with 1.
# - A single .loc assignment replaces the chained zeros_df[cate][mask] = 1,
#   which may write to a temporary copy and leave zeros_df untouched.
# - The mask compares the title prefix for equality, so a category name that
#   merely appears elsewhere in a title is not counted.
title_category = df["title"].str.split(": ").str[0]
for cate in cata_list:
    zeros_df.loc[title_category == cate, cate] = 1
# print(zeros_df)

# Column sums give the number of calls per category.
sum_one = zeros_df.sum(axis=0)
print(sum_one)

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib as mlp
# Use a CJK-capable font and keep the minus sign displayable with it.
mlp.rcParams['font.sans-serif'] = ['KaiTi']
mlp.rcParams["axes.unicode_minus"] = False
pd.set_option('display.max_columns', None)

file_path = "./911.csv"
df = pd.read_csv(file_path)
# print(df.head(2))
# print(df.info())

# The category of a call is the part of "title" before the first ": ".
temp_list = df["title"].str.split(": ").tolist()
# One category per row, in row order (duplicates are kept on purpose so the
# list can become a column of df).
cata_list = [parts[0] for parts in temp_list]
# print(cata_list)

# Attach the category as a new column; a plain list of the right length is
# enough, no intermediate array/DataFrame is needed.
df["cate"] = cata_list

# Count the titles per category; selecting "title" before counting avoids
# counting every other column as well.
Data = df.groupby(by="cate")["title"].count()
print(Data)

cate
EMS        320326
Fire        96177
Traffic    223395
Name: title, dtype: int64

原文链接：https://blog.csdn.net/Blood_dunk/article/details/112553310

数据科学包-Day5

字符串的离散化

数据合并

数据分组聚合

索引和复合索引

Series复合索引

练习

你可能也喜欢