Home > Développement > Python > Pandas

Pandas

import numpy as np
import pandas as pd
df = pd.DataFrame( { 'A': [1,np.nan,np.nan], 'B': [5,np.nan,3], 'C': [6,8,4] } )
df
A B C
0 1.0 5.0 6
1 NaN NaN 8
2 NaN 3.0 4
df.dropna() # explicite axis=0 -> ligne
A B C
0 1.0 5.0 6
df.dropna(axis=1) # colone
C
0 6
1 8
2 4
df.dropna(thresh=2) # drop ligne avec plus de 2 nan
A B C
0 1.0 5.0 6
2 NaN 3.0 4
df.fillna(value='REMPLACE') # remplace nan , affiche mais ne prend pas en compte l'action
A B C
0 1 5 6
1 REMPLACE REMPLACE 8
2 REMPLACE 3 4
df.fillna(value='REMPLACE', inplace=True) # implace true , prend en compte l'action
df
A B C
0 1 5 6
1 REMPLACE REMPLACE 8
2 REMPLACE 3 4
df = pd.DataFrame( { 'A': [1,4,np.nan], 'B': [5,np.nan,3], 'C': [6,8,4] } )
df['A'].fillna(value=df['A'].mean(), inplace=True) # mean -> moyenne de la colonne
df
A B C
0 1.0 5.0 6
1 4.0 NaN 8
2 2.5 3.0 4

Group By

data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
        'Person':['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
       'Sales':[200, 120, 340, 124, 2, 2]}
df = pd.DataFrame(data)
df
Company Person Sales
0 GOOG Sam 200
1 GOOG Charlie 120
2 MSFT Amy 340
3 MSFT Vanessa 124
4 FB Carl 2
5 FB Sarah 2
df.groupby('Company') # regroupement par entreprise
<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7f6a94332cf8>
by_comp = df.groupby('Company')
by_comp.mean() # moyenne, ordre alpha
Sales
Company
FB 2
GOOG 160
MSFT 232
by_comp.max()
Person Sales
Company
FB Sarah 2
GOOG Sam 200
MSFT Vanessa 340
by_comp.min()
Person Sales
Company
FB Carl 2
GOOG Charlie 120
MSFT Amy 124
by_comp.describe() # tab avec les opérations par défaut
Sales
count mean std min 25% 50% 75% max
Company
FB 2.0 2.0 0.000000 2.0 2.0 2.0 2.0 2.0
GOOG 2.0 160.0 56.568542 120.0 140.0 160.0 180.0 200.0
MSFT 2.0 232.0 152.735065 124.0 178.0 232.0 286.0 340.0
by_comp.describe().transpose() # tab avec les opérations par défaut, tableau inversé
Company FB GOOG MSFT
Sales count 2.0 2.000000 2.000000
mean 2.0 160.000000 232.000000
std 0.0 56.568542 152.735065
min 2.0 120.000000 124.000000
25% 2.0 140.000000 178.000000
50% 2.0 160.000000 232.000000
75% 2.0 180.000000 286.000000
max 2.0 200.000000 340.000000
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                        index=[0, 1, 2, 3])
df1
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
2 A2 B2 C2 D2
3 A3 B3 C3 D3
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                         index=[4, 5, 6, 7])
df2
A B C D
4 A4 B4 C4 D4
5 A5 B5 C5 D5
6 A6 B6 C6 D6
7 A7 B7 C7 D7
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                        'B': ['B8', 'B9', 'B10', 'B11'],
                        'C': ['C8', 'C9', 'C10', 'C11'],
                        'D': ['D8', 'D9', 'D10', 'D11']},
                        index=[8, 9, 10, 11])
df3
A B C D
8 A8 B8 C8 D8
9 A9 B9 C9 D9
10 A10 B10 C10 D10
11 A11 B11 C11 D11
pd.concat([df1,df2,df3])
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
2 A2 B2 C2 D2
3 A3 B3 C3 D3
4 A4 B4 C4 D4
5 A5 B5 C5 D5
6 A6 B6 C6 D6
7 A7 B7 C7 D7
8 A8 B8 C8 D8
9 A9 B9 C9 D9
10 A10 B10 C10 D10
11 A11 B11 C11 D11
pd.concat([df1,df2,df3], axis=1)
A B C D A B C D A B C D
0 A0 B0 C0 D0 NaN NaN NaN NaN NaN NaN NaN NaN
1 A1 B1 C1 D1 NaN NaN NaN NaN NaN NaN NaN NaN
2 A2 B2 C2 D2 NaN NaN NaN NaN NaN NaN NaN NaN
3 A3 B3 C3 D3 NaN NaN NaN NaN NaN NaN NaN NaN
4 NaN NaN NaN NaN A4 B4 C4 D4 NaN NaN NaN NaN
5 NaN NaN NaN NaN A5 B5 C5 D5 NaN NaN NaN NaN
6 NaN NaN NaN NaN A6 B6 C6 D6 NaN NaN NaN NaN
7 NaN NaN NaN NaN A7 B7 C7 D7 NaN NaN NaN NaN
8 NaN NaN NaN NaN NaN NaN NaN NaN A8 B8 C8 D8
9 NaN NaN NaN NaN NaN NaN NaN NaN A9 B9 C9 D9
10 NaN NaN NaN NaN NaN NaN NaN NaN A10 B10 C10 D10
11 NaN NaN NaN NaN NaN NaN NaN NaN A11 B11 C11 D11
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
left
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                          'C': ['C0', 'C1', 'C2', 'C3'],
                          'D': ['D0', 'D1', 'D2', 'D3']})
right
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
pd.merge(left, right, how='inner', on='key') # Inner join on key
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                      index=['K0', 'K1', 'K2']) 
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                    'D': ['D0', 'D2', 'D3']},
                      index=['K0', 'K2', 'K3'])
left.join(right)
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
left.join(right, how='outer')
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3

Opérations

df = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']})
df
col1 col2 col3
0 1 444 abc
1 2 555 def
2 3 666 ghi
3 4 444 xyz
df.head(2) # afficher les 2 premières lignes
col1 col2 col3
0 1 444 abc
1 2 555 def
df['col2'].unique()
array([444, 555, 666])
df['col2'].nunique() # return index
3
df['col2'].value_counts()
444 2
555 1
666 1
Name: col2, dtype: int64
newdf = df[ (df['col1'] > 2) & (df['col2'] == 444) ] # si value col1 > 2 & value col2 == 444
newdf
col1 col2 col3
3 4 444 xyz
def fois2(x):
    return x*2
df['col1'].apply(fois2)
0 2
1 4
2 6
3 8
Name: col1, dtype: int64
df['col3'].apply(len)
0 3
1 3
2 3
3 3
Name: col3, dtype: int64
df['col1'].sum() # sum col1
10
del df['col3'] # delete col3
df
col1 col2
0 1 444
1 2 555
2 3 666
3 4 444
df.sort_values(by='col2') # sort col2
col1 col2
0 1 444
3 4 444
1 2 555
2 3 666
df.isnull() # check is value is null, return bool
col1 col2
0 False False
1 False False
2 False False
3 False False

CSV

df = pd.read_csv('exemple')
---------------------------------------------------------------------------

FileNotFoundError                         Traceback (most recent call last)

<ipython-input-88-625ec38768ef> in <module>
----> 1 df = pd.read_csv('exemple')


/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)
    676                     skip_blank_lines=skip_blank_lines)
    677 
--> 678         return _read(filepath_or_buffer, kwds)
    679 
    680     parser_f.__name__ = name


/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    438 
    439     # Create the parser.
--> 440     parser = TextFileReader(filepath_or_buffer, **kwds)
    441 
    442     if chunksize or iterator:


/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    785             self.options['has_index_names'] = kwds['has_index_names']
    786 
--> 787         self._make_engine(self.engine)
    788 
    789     def close(self):


/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1012     def _make_engine(self, engine='c'):
   1013         if engine == 'c':
-> 1014             self._engine = CParserWrapper(self.f, **self.options)
   1015         else:
   1016             if engine == 'python':


/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1706         kwds['usecols'] = self.usecols
   1707 
-> 1708         self._reader = parsers.TextReader(src, **kwds)
   1709 
   1710         passed_names = self.names is None


pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()


pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()


FileNotFoundError: File b'exemple' does not exist
df.to_csv('toto', index=False)