Pandas
import pandas as pd
Series
s = pd.Series([3, -5, 7, 4], index=['a', 'b', 'c', 'd'])
DataFrame
data = {'Country': ['Belgium', 'India', 'Brazil'],
'Capital': ['Brussels', 'New Delhi', 'Brasilia'],
'Population': [11190846, 1303171035, 207847528]}
df = pd.DataFrame(data,columns=['Country', 'Capital', 'Population'])
I/O
csv
pd.read_csv('file.csv', header=None, nrows=5)
df.to_csv('myDataFrame.csv')
Excel
xlsx = pd.ExcelFile('file.xls')
df = pd.read_excel(xlsx, 'Sheet1')
pd.read_excel('file.xlsx')
df.to_excel('dir/myDataFrame.xlsx', sheet_name='Sheet1')
SQL
from sqlalchemy import create_engine
engine = create_engine('sqlite:///:memory:')
pd.read_sql(SELECT * FROM my_table;, engine)
pd.read_sql_table('my_table', engine)
pd.read_sql_query(SELECT * FROM my_table;', engine)
df.to_sql('myDf', engine)
Selection
By Label/Position
s['b']
# -5
df[1:]
# Country Capital Population
# 1 India New Delhi 1303171035
# 2 Brazil Brasilia 207847528
df.iloc([0], [0])
# 'Belgium'
df.loc([0], ['Country'])
# 'Belgium'
df.ix[2]
# Country Brazil
# Capital Brasilia
# Population 207847528
df.ix[:, 'Capital']
# 0 Brussels
# 1 New Delhi
# 2 Brasilia
df.ix[1, 'Capital']
# 'New Delhi'
Boolean Indexing
s[~(s > 1)]
s[(s < -1) | (s > 2)]
df[df['Population'] > 1200000000]
s['a'] = 6
Dropping
s.drop(['a', 'c']) # Drop values from rows (axis=0)
df.drop('Country', axis=1) # Drop values from columns(axis=1)
df.sort_index() # Sort by labels along an axis
df.sort_values(by='Country') # Sort by the values along an axis
df.rank() # Assign ranks to entries
Information
df.shape # (rows, columns)
df.index # Describe index
df.columns # Describe DataFrame columns
df.info() # Info on DataFrame
df.count() # Number of non-NA values
df.sum() # Sum of values
df.cumsum() # Cumulative sum of values
df.min()/df.max() # Minimum/maximum values
df.idxmin()/df.idxmax() # Minimum/Maximum index value
df.describe() # Summary statistics
df.mean() # Mean of values
df.median() # Median of values
Function
f = lambda x: x*2
df.apply(f) # Apply function
df.applynap(f) # Apply function element-wise
s3 = pd.Series([7, -2, 3], index=['a', 'c', 'd'])
s + s3
# a 10.0
# b NaN
# c 5.0
# d 7.0
s.add(s3, fill_value=0)
# a 10.0
# b -5.0
# c 5.0
# d 7.0
s.sub(s3, fill_value=2)
s.div(s3, fill_value=4)
s.mul(s3, fill_value=3)