Pokemon 种类与属性值的数据探索

1 简介

今天在 Kaggle 上看到了一个比较有意思的数据集(Dataset),收录了手游 Pokemon 中 721 只神奇宝贝的基本数据,包括编号(#)、名字(Name)、类别(Type 1)、二级分类(Type 2)、六项基本属性(血量:HP,攻击力:Attack,防御力:Defense,魔攻:Sp. Atk,魔防:Sp. Def,速度:Speed)以及属性总值(Total),此外还有 Generation 和 Legendary 两列。

下面参考 Kaggle 上的一些文章,探究 Pokemon 的类别对其基本属性的影响。

2 Pokemon 基本数据概要

1
2
3
4
5
6
7
8
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Pokemon table from the remote CSV and preview the first rows.
csv_url = "http://airing.ursb.me/data/Pokemon.csv"
data_df = pd.read_csv(csv_url)
data_df.head()

1
2
3
# The Generation and Legendary columns are not used in this analysis, so
# drop them. `columns=` is used instead of a positional axis argument, which
# recent pandas versions no longer accept.
data_df = data_df.drop(columns=['Generation', 'Legendary'])
data_df.describe()

1
2
3
# Start with the relationship between HP and Attack.
sns.jointplot(data=data_df, x="HP", y="Attack")
plt.show()

png

1
2
3
# Box plot of the Total column over all rows of the table.
sns.boxplot(data=data_df, y="Total")
plt.show()

png

1
2
3
4
# The id ('#') and Total columns carry no information about the individual
# stats, so leave them out. `columns=` replaces the positional axis argument,
# which recent pandas versions no longer accept.
data_df_2 = data_df.drop(columns=['#', 'Total'])
# Wide-form call: one box per remaining numeric column.
sns.boxplot(data=data_df_2)
plt.show()

png

1
2
3
# Names of the int64 columns of data_df_2. The mask is built from data_df_2
# itself so that it is aligned with the Series it filters (the earlier frame
# data_df has extra columns).
int_dtype_mask = data_df_2.dtypes == 'int64'
var_int = data_df_2.dtypes[int_dtype_mask].index
# Skip the first integer column (presumably HP, given the Attack..Speed
# result shown in the cell output).
var_int = var_int[1:]
var_int
Index(['Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed'], dtype='object')
1
2
3
4
5
6
7
8
# Histogram of every column in var_int, laid out on a 3x3 grid.
l_int = len(var_int)
fig = plt.figure(figsize=(13, 8))
for i, val in enumerate(var_int):
    # Draw on the Axes returned by add_subplot rather than relying on
    # pyplot's implicit "current axes".
    hist_ax = fig.add_subplot(3, 3, i + 1)
    hist_ax.hist(data_df_2[val], bins=50)
    hist_ax.set_title(val)
plt.show()

png

1
2
# Pairwise correlation between the stats. data_df_2 still contains the
# string columns (Name, Type 1, Type 2), which corr() cannot process in
# recent pandas versions, so restrict the frame to numeric columns first.
data_df_2.select_dtypes(include='number').corr()

3 探索 Pokemon 类别对其属性的影响

1
2
3
4
5
6
# List the distinct primary types, then count the rows of each type
# (non-null '#' entries per group), shown from most to least common.
type1 = data_df['Type 1'].unique()
print(type1)
data_type1 = data_df.groupby('Type 1')['#'].count()
data_type1.sort_values(ascending=False)
['Grass' 'Fire' 'Water' 'Bug' 'Normal' 'Poison' 'Electric' 'Ground' 'Fairy'
 'Fighting' 'Psychic' 'Rock' 'Ghost' 'Ice' 'Dragon' 'Dark' 'Steel' 'Flying']





Type 1
Water       112
Normal       98
Grass        70
Bug          69
Psychic      57
Fire         52
Rock         44
Electric     44
Ground       32
Dragon       32
Ghost        32
Dark         31
Poison       28
Steel        27
Fighting     27
Ice          24
Fairy        17
Flying        4
Name: #, dtype: int64
1
2
3
4
5
6
7
8
9
10
# Pie chart of the share of each primary type: the eight most common types
# get their own slice and every remaining type is merged into 'Other'.
top_types = ['Water', 'Normal', 'Grass', 'Bug', 'Psychic', 'Fire', 'Electric', 'Rock']
labels = top_types + ['Other']
# Take the slice sizes from the data instead of hard-coding them, so that
# 'Other' is always the true remainder and the slices add up to the number
# of rows in the table.
type_counts = data_df['Type 1'].value_counts()
sizes = [int(type_counts[type_name]) for type_name in top_types]
sizes.append(int(type_counts.sum()) - sum(sizes))
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', 'yellow',
          'lightgreen', 'silver', 'white', 'pink']
# Pull only the 'Other' slice out of the pie.
explode = (0, 0, 0, 0, 0, 0, 0, 0, 0.1)
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.axis('equal')
plt.title("Percentage of Different Types of Pokemon")
plt.show()

png

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Box plot of Total for each primary type. Each type name is mapped to an
# integer code (its position in the list below), and that code is used as
# the x-axis category.
type_order = ['Grass', 'Fire', 'Water', 'Bug', 'Normal', 'Poison',
              'Electric', 'Ground', 'Fairy', 'Fighting', 'Psychic', 'Rock',
              'Ghost', 'Ice', 'Dragon', 'Dark', 'Steel', 'Flying']
type_to_int_dict = {type_name: code for code, type_name in enumerate(type_order)}
data_df['Int_Type1'] = data_df['Type 1'].map(type_to_int_dict).astype(int)
sns.set(style="ticks")
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x="Int_Type1", y="Total", data=data_df, palette="PRGn", ax=ax)
sns.despine(offset=10, trim=True)
plt.show()
# Author's observation: the Dragon type (code 14) sits well above the other
# types in Total.

png

1
2
# Reshape to long form: the identifying columns stay as they are and every
# other column becomes a (Stat, value) pair on its own row.
id_columns = ["Name", "Type 1", "Type 2"]
data_type1 = data_df_2.melt(id_vars=id_columns, var_name="Stat")
data_type1.head()

1
2
3
4
5
# Swarm plot of every stat value, coloured by primary type.
plt.figure(figsize=(12, 10))
plt.ylim(0, 275)
# `dodge=True` separates the hue levels along the categorical axis; it is the
# current name of the option that older seaborn versions called `split`.
sns.swarmplot(x="Stat", y="value", data=data_type1, hue="Type 1", dodge=True, size=7)
# Place the legend just outside the top-right corner of the axes.
plt.legend(bbox_to_anchor=(1, 1), loc=2, borderaxespad=0.)
plt.show()

png

1
2
3
4
5
6
7
# Box plots give a more direct view of each stat per primary type:
# one subplot row per stat column.
fig = plt.figure(figsize=(13, 24))
stat_columns = var_int[:6]
for i, col in enumerate(stat_columns):
    # Size the grid from the number of columns actually plotted so that no
    # empty subplot row is left over.
    ax1 = fig.add_subplot(len(stat_columns), 1, i + 1)
    sns.boxplot(x=data_df['Type 1'], y=data_df_2[col], ax=ax1)
plt.show()

png

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Author's reading of the box plots: Dragon-type Pokemon have the highest
# Attack, Steel-type the strongest Defense and Flying-type the highest Speed.
# A box plot marks the quantile positions, whereas a violin plot shows the
# density at every position, so the same data is drawn again as violins.
# Distribution of HP among all types of Pokemon.
# Pivot to wide form: one row per Name, one column per 'Type 1', HP as values.
hp_data = data_df[['Name', 'Type 1', 'HP']].pivot_table(
    index=['Name'], columns=['Type 1'], values='HP')
hp_data.head()
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=hp_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_title("HP of Different Types of Pokemon")
ax.set(ylim=(0, 200))
sns.despine(left=True, bottom=True)
plt.show()

png

1
2
3
4
5
6
7
8
9
10
11
# Distribution of Attack among all types of Pokemon.
# Pivot to wide form: one row per Name, one column per 'Type 1', Attack as values.
attack_data = data_df[['Name', 'Type 1', 'Attack']].pivot_table(
    index=['Name'], columns=['Type 1'], values='Attack')
attack_data.head()
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=attack_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_title("Attack of Different Types of Pokemon")
ax.set(ylim=(0, 200))
sns.despine(left=True, bottom=True)
plt.show()

png

1
2
3
4
5
6
7
8
9
10
11
# Distribution of Defense among all types of Pokemon.
# Pivot to wide form: one row per Name, one column per 'Type 1', Defense as values.
defense_data = data_df[['Name', 'Type 1', 'Defense']].pivot_table(
    index=['Name'], columns=['Type 1'], values='Defense')
defense_data.head()
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=defense_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_title("Defense of Different Types of Pokemon")
ax.set(ylim=(0, 200))
sns.despine(left=True, bottom=True)
plt.show()

png

1
2
3
4
5
6
7
8
9
10
11
# Distribution of Sp. Atk among all types of Pokemon.
# Pivot to wide form: one row per Name, one column per 'Type 1', Sp. Atk as values.
sp_attack_data = data_df[['Name', 'Type 1', 'Sp. Atk']].pivot_table(
    index=['Name'], columns=['Type 1'], values='Sp. Atk')
sp_attack_data.head()
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=sp_attack_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_title("Sp.Attack of Different Types of Pokemon")
ax.set(ylim=(0, 200))
sns.despine(left=True, bottom=True)
plt.show()

png

1
2
3
4
5
6
7
8
9
10
11
# Distribution of Sp. Def among all types of Pokemon.
# Pivot to wide form: one row per Name, one column per 'Type 1', Sp. Def as values.
sp_defense_data = data_df[['Name', 'Type 1', 'Sp. Def']].pivot_table(
    index=['Name'], columns=['Type 1'], values='Sp. Def')
sp_defense_data.head()
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=sp_defense_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_title("Sp.Defense of Different Types of Pokemon")
ax.set(ylim=(0, 200))
sns.despine(left=True, bottom=True)
plt.show()

png

1
2
3
4
5
6
7
8
9
10
11
# Distribution of Speed among all types of Pokemon.
# Pivot to wide form: one row per Name, one column per 'Type 1', Speed as values.
speed_data = data_df[['Name', 'Type 1', 'Speed']].pivot_table(
    index=['Name'], columns=['Type 1'], values='Speed')
speed_data.head()
f, ax = plt.subplots(figsize=(18, 6))
sns.violinplot(data=speed_data, palette="Set3", bw=.2, cut=1, linewidth=1, ax=ax)
ax.set_title("Speed of Different Types of Pokemon")
ax.set(ylim=(0, 200))
sns.despine(left=True, bottom=True)
plt.show()

png