Home > Développement > Python > Data Set

Data Set

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the "fivethirtyeight" matplotlib style sheet to the plots drawn below.
plt.style.use('fivethirtyeight')
# Load the data set; the path is relative to the current working directory.
df = pd.read_csv('Pokemon.csv')
df.head(n=10)  # preview the first 10 rows
# Name Type 1 Type 2 Total HP Attack Defense Sp. Atk Sp. Def Speed Generation Legendary
0 1 Bulbasaur Grass Poison 318 45 49 49 65 65 45 1 False
1 2 Ivysaur Grass Poison 405 60 62 63 80 80 60 1 False
2 3 Venusaur Grass Poison 525 80 82 83 100 100 80 1 False
3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 122 120 80 1 False
4 4 Charmander Fire NaN 309 39 52 43 60 50 65 1 False
5 5 Charmeleon Fire NaN 405 58 64 58 80 65 80 1 False
6 6 Charizard Fire Flying 534 78 84 78 109 85 100 1 False
7 6 CharizardMega Charizard X Fire Dragon 634 78 130 111 130 85 100 1 False
8 6 CharizardMega Charizard Y Fire Flying 634 78 104 78 159 115 100 1 False
9 7 Squirtle Water NaN 314 44 48 65 50 64 43 1 False
# Upper-case every column header and turn underscores into spaces.
df.columns = [header.upper().replace('_', ' ') for header in df.columns]
df.head(n=10)
# NAME TYPE 1 TYPE 2 TOTAL HP ATTACK DEFENSE SP. ATK SP. DEF SPEED GENERATION LEGENDARY
0 1 Bulbasaur Grass Poison 318 45 49 49 65 65 45 1 False
1 2 Ivysaur Grass Poison 405 60 62 63 80 80 60 1 False
2 3 Venusaur Grass Poison 525 80 82 83 100 100 80 1 False
3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 122 120 80 1 False
4 4 Charmander Fire NaN 309 39 52 43 60 50 65 1 False
5 5 Charmeleon Fire NaN 405 58 64 58 80 65 80 1 False
6 6 Charizard Fire Flying 534 78 84 78 109 85 100 1 False
7 6 CharizardMega Charizard X Fire Dragon 634 78 130 111 130 85 100 1 False
8 6 CharizardMega Charizard Y Fire Flying 634 78 104 78 159 115 100 1 False
9 7 Squirtle Water NaN 314 44 48 65 50 64 43 1 False
df = df.set_index('NAME') # the NAME column becomes the index
df.head(n=10)
# TYPE 1 TYPE 2 TOTAL HP ATTACK DEFENSE SP. ATK SP. DEF SPEED GENERATION LEGENDARY
NAME
Bulbasaur 1 Grass Poison 318 45 49 49 65 65 45 1 False
Ivysaur 2 Grass Poison 405 60 62 63 80 80 60 1 False
Venusaur 3 Grass Poison 525 80 82 83 100 100 80 1 False
VenusaurMega Venusaur 3 Grass Poison 625 80 100 123 122 120 80 1 False
Charmander 4 Fire NaN 309 39 52 43 60 50 65 1 False
Charmeleon 5 Fire NaN 405 58 64 58 80 65 80 1 False
Charizard 6 Fire Flying 534 78 84 78 109 85 100 1 False
CharizardMega Charizard X 6 Fire Dragon 634 78 130 111 130 85 100 1 False
CharizardMega Charizard Y 6 Fire Flying 634 78 104 78 159 115 100 1 False
Squirtle 7 Water NaN 314 44 48 65 50 64 43 1 False
# Drop everything that precedes "Mega" in a name, e.g.
# "VenusaurMega Venusaur" -> "Mega Venusaur". Names without "Mega" are untouched.
# The pattern is a regular expression (greedy prefix + lookahead), so regex=True
# must be explicit: pandas >= 2.0 treats the pattern as a literal string by
# default, which would silently leave every name unchanged.
df.index = df.index.str.replace(r".*(?=Mega)", "", regex=True)
df.head(n=10)
# TYPE 1 TYPE 2 TOTAL HP ATTACK DEFENSE SP. ATK SP. DEF SPEED GENERATION LEGENDARY
NAME
Bulbasaur 1 Grass Poison 318 45 49 49 65 65 45 1 False
Ivysaur 2 Grass Poison 405 60 62 63 80 80 60 1 False
Venusaur 3 Grass Poison 525 80 82 83 100 100 80 1 False
Mega Venusaur 3 Grass Poison 625 80 100 123 122 120 80 1 False
Charmander 4 Fire NaN 309 39 52 43 60 50 65 1 False
Charmeleon 5 Fire NaN 405 58 64 58 80 65 80 1 False
Charizard 6 Fire Flying 534 78 84 78 109 85 100 1 False
Mega Charizard X 6 Fire Dragon 634 78 130 111 130 85 100 1 False
Mega Charizard Y 6 Fire Flying 634 78 104 78 159 115 100 1 False
Squirtle 7 Water NaN 314 44 48 65 50 64 43 1 False
# Remove the '#' column from the table.
df = df.drop(columns=['#'])
df.head(n=10)
TYPE 1 TYPE 2 TOTAL HP ATTACK DEFENSE SP. ATK SP. DEF SPEED GENERATION LEGENDARY
NAME
Bulbasaur Grass Poison 318 45 49 49 65 65 45 1 False
Ivysaur Grass Poison 405 60 62 63 80 80 60 1 False
Venusaur Grass Poison 525 80 82 83 100 100 80 1 False
Mega Venusaur Grass Poison 625 80 100 123 122 120 80 1 False
Charmander Fire NaN 309 39 52 43 60 50 65 1 False
Charmeleon Fire NaN 405 58 64 58 80 65 80 1 False
Charizard Fire Flying 534 78 84 78 109 85 100 1 False
Mega Charizard X Fire Dragon 634 78 130 111 130 85 100 1 False
Mega Charizard Y Fire Flying 634 78 104 78 159 115 100 1 False
Squirtle Water NaN 314 44 48 65 50 64 43 1 False
df.columns # column headers
Index(['TYPE 1', 'TYPE 2', 'TOTAL', 'HP', 'ATTACK', 'DEFENSE', 'SP. ATK',
       'SP. DEF', 'SPEED', 'GENERATION', 'LEGENDARY'],
      dtype='object')
df.shape # 800 rows, 11 columns
(800, 11)
# Where TYPE 2 is missing, fall back to the row's TYPE 1.
# The result is assigned back to the column rather than using
# fillna(..., inplace=True) on df['TYPE 2']: that is chained assignment, which
# pandas deprecates and which does not modify df under copy-on-write.
df['TYPE 2'] = df['TYPE 2'].fillna(df['TYPE 1'])
df.head(n=10)
TYPE 1 TYPE 2 TOTAL HP ATTACK DEFENSE SP. ATK SP. DEF SPEED GENERATION LEGENDARY
NAME
Bulbasaur Grass Poison 318 45 49 49 65 65 45 1 False
Ivysaur Grass Poison 405 60 62 63 80 80 60 1 False
Venusaur Grass Poison 525 80 82 83 100 100 80 1 False
Mega Venusaur Grass Poison 625 80 100 123 122 120 80 1 False
Charmander Fire Fire 309 39 52 43 60 50 65 1 False
Charmeleon Fire Fire 405 58 64 58 80 65 80 1 False
Charizard Fire Flying 534 78 84 78 109 85 100 1 False
Mega Charizard X Fire Dragon 634 78 130 111 130 85 100 1 False
Mega Charizard Y Fire Flying 634 78 104 78 159 115 100 1 False
Squirtle Water Water 314 44 48 65 50 64 43 1 False
# idxmax returns the index label of the maximum; NAME is the index here, so
# this prints a name. Series.argmax is deprecated for that purpose and
# returns the integer position of the maximum instead.
print('MAX HP', df['HP'].idxmax())
MAX HP Blissey
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: FutureWarning: 'argmax' is deprecated, use 'idxmax' instead. The behavior of 'argmax'
will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  """Entry point for launching an IPython kernel.
print('MAX DEFENSE', df['DEFENSE'].idxmax()) # index label -> the NAME column is now the index
MAX DEFENSE Mega Steelix
df.sort_values('TOTAL', ascending=False).head() # sort the table by TOTAL, highest first, and show the top rows
TYPE 1 TYPE 2 TOTAL HP ATTACK DEFENSE SP. ATK SP. DEF SPEED GENERATION LEGENDARY
NAME
Mega Rayquaza Dragon Flying 780 105 180 100 180 100 115 3 True
Mega Mewtwo Y Psychic Psychic 780 106 150 70 194 120 140 1 True
Mega Mewtwo X Psychic Fighting 780 106 190 100 154 100 130 1 True
KyogrePrimal Kyogre Water Water 770 100 150 90 180 160 90 3 True
GroudonPrimal Groudon Ground Fire 770 100 180 160 150 90 90 3 True
df['TYPE 1'].unique() # all distinct TYPE 1 values
array(['Grass', 'Fire', 'Water', 'Bug', 'Normal', 'Poison', 'Electric',
       'Ground', 'Fairy', 'Fighting', 'Psychic', 'Rock', 'Ghost', 'Ice',
       'Dragon', 'Dark', 'Steel', 'Flying'], dtype=object)
df['TYPE 1'].nunique() # number of distinct TYPE 1 values
18
df.groupby(['TYPE 1']).size() # number of rows for each TYPE 1 value
TYPE 1
Bug          69
Dark         31
Dragon       32
Electric     44
Fairy        17
Fighting     27
Fire         52
Flying        4
Ghost        32
Grass        70
Ground       32
Ice          24
Normal       98
Poison       28
Psychic      57
Rock         44
Steel        27
Water       112
dtype: int64
df.groupby(['TYPE 1']).size().sort_values(ascending=False) # rows per TYPE 1 value, largest count first
TYPE 1
Water       112
Normal       98
Grass        70
Bug          69
Psychic      57
Fire         52
Rock         44
Electric     44
Ground       32
Dragon       32
Ghost        32
Dark         31
Poison       28
Steel        27
Fighting     27
Ice          24
Fairy        17
Flying        4
dtype: int64
df.describe() # count, mean, std, min, quartiles and max of the numeric columns
TOTAL HP ATTACK DEFENSE SP. ATK SP. DEF SPEED GENERATION
count 800.00000 800.000000 800.000000 800.000000 800.000000 800.000000 800.000000 800.00000
mean 435.10250 69.258750 79.001250 73.842500 72.820000 71.902500 68.277500 3.32375
std 119.96304 25.534669 32.457366 31.183501 32.722294 27.828916 29.060474 1.66129
min 180.00000 1.000000 5.000000 5.000000 10.000000 20.000000 5.000000 1.00000
25% 330.00000 50.000000 55.000000 50.000000 49.750000 50.000000 45.000000 2.00000
50% 450.00000 65.000000 75.000000 70.000000 65.000000 70.000000 65.000000 3.00000
75% 515.00000 80.000000 100.000000 90.000000 95.000000 90.000000 90.000000 5.00000
max 780.00000 255.000000 190.000000 230.000000 194.000000 230.000000 180.000000 6.00000
# Histogram of the ATTACK stat with the mean marked by a dashed line.
# The upper bound is 201 so that the last bin edge is 200: range(0, 200, 20)
# stops at 180, and plt.hist drops every value above the last edge, while
# ATTACK reaches 190 in this data set.
bins = range(0, 201, 20)
plt.hist(df['ATTACK'], bins, histtype="bar", rwidth=1.2, color='#0FF0FF')  # attack values between 0 and 200
plt.xlabel('Attack')
plt.ylabel('Count')
# (The argument-less plt.plot() call was removed: it draws nothing.)
plt.axvline(df['ATTACK'].mean(), linestyle="dashed", color="yellow")  # dashed line = mean
plt.show()

png

# Attack vs. defense scatter plot for Fire and Water rows (first 50 of each).
# A row belongs to a type when either TYPE 1 or TYPE 2 matches.
fire = df[(df['TYPE 1'] == 'Fire') | (df['TYPE 2'] == 'Fire')]
water = df[(df['TYPE 1'] == 'Water') | (df['TYPE 2'] == 'Water')]
# Single-letter colour codes must be lowercase: matplotlib no longer accepts
# the uppercase forms 'R' / 'B' and raises ValueError on them.
plt.scatter(fire.ATTACK.head(50), fire.DEFENSE.head(50), color='r', label='Fire', marker='*', s=50)
plt.scatter(water.ATTACK.head(50), water.DEFENSE.head(50), color='b', label='Water', s=50)
plt.xlabel('Attack')  # horizontal axis, read left to right
plt.ylabel('Defense')  # vertical axis, read bottom to top
plt.legend()
# (The argument-less plt.plot() call was removed: it draws nothing.)
fig = plt.gcf()
fig.set_size_inches(12, 6)  # display size
plt.show()

png

# Pie chart of the TYPE 1 distribution: eight named types plus an 'Other' slice.
labels = 'Water', 'Normal', 'Grass', 'Bug', 'Psychic', 'Fire', 'Electric', 'Rock', 'Other'
# Slice sizes are computed from the data instead of being hard-coded. The
# previous literal list gave 'Other' a size of 175, but the eight named types
# cover 546 of the 800 rows, so the remainder is 254.
type_counts = df['TYPE 1'].value_counts()
sizes = [int(type_counts.get(type_name, 0)) for type_name in labels[:-1]]
sizes.append(len(df) - sum(sizes))  # 'Other' = every row not in a named type
# Single-letter colour codes must be lowercase for current matplotlib.
colors = ['y', 'b', '#00ff00', 'c', 'r', 'g', 'silver', 'beige', 'm']
explode = (0, 0, 0.1, 0, 0, 0, 0, 0, 0)  # the slice that is pulled out (Grass)
# '%1.0f%%' prints whole-number percentages; it is the explicit spelling of
# the former '%1.lf%%' (ignored 'l' modifier, empty precision meaning 0).
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct="%1.0f%%", shadow=True, startangle=90)
plt.axis('equal')
fig = plt.gcf()
fig.set_size_inches(12, 6)  # display size
plt.show()

png

# Box plot of ATTACK for each TYPE 1 value, y axis limited to 0..200.
fig, ax = plt.subplots(figsize=(15, 5))
sns.boxplot(data=df, x='TYPE 1', y='ATTACK', ax=ax)
ax.set_ylim(0, 200)
plt.show()

png

# Violin plot of TOTAL for each GENERATION.
fig, ax = plt.subplots(figsize=(15, 5))
sns.violinplot(data=df, x='GENERATION', y='TOTAL', ax=ax)
plt.show()
/usr/local/lib/python3.6/dist-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval

png

# Swarm plot of TOTAL for the ten most frequent TYPE 1 values, coloured by
# LEGENDARY, with a dashed red line at the mean TOTAL of the plotted rows.
plt.figure(figsize=(12, 6))
top_types = df['TYPE 1'].value_counts().iloc[:10]
in_top_types = df['TYPE 1'].isin(top_types.index)
df1 = df[in_top_types]
mean_total = df1['TOTAL'].mean()
sns.swarmplot(data=df1, x='TYPE 1', y='TOTAL', hue='LEGENDARY')
plt.axhline(mean_total, linestyle='dashed', color='red')
plt.show()

png

# Heat map of the pairwise correlations between the numeric columns.
plt.figure(figsize=(10, 6))
# Restrict to numeric and boolean columns before calling corr(): pandas >= 2.0
# no longer drops string columns (TYPE 1 / TYPE 2) silently and raises on them.
# select_dtypes is used instead of corr(numeric_only=True) so that the code
# also runs on older pandas releases that lack that keyword.
numeric_columns = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_columns.corr(), annot=True)  # annot -> print the correlation values
plt.show()

png

# Line plot of the number of rows per GENERATION for a selection of TYPE 1 values.
# Count rows per (GENERATION, TYPE 1) pair; after count() every remaining
# column holds that count, and TOTAL is the one kept.
a = df.groupby(['GENERATION', 'TYPE 1']).count().reset_index()
a = a[['GENERATION', 'TYPE 1', 'TOTAL']]
# Reshape to one row per generation and one column per type. The arguments are
# passed by keyword because pandas >= 2.0 no longer accepts them positionally.
a = a.pivot(index='GENERATION', columns='TYPE 1', values='TOTAL')
a[['Water', 'Normal', 'Grass', 'Dragon', 'Fire', 'Electric', 'Rock', 'Flying']].plot(color=['b', 'r', 'g', '#FFA500', 'brown', '#666fff', '#001012', 'y'], marker='o')
fig = plt.gcf()
fig.set_size_inches(12, 6)
plt.show()

png