Package Pandas
Necessary Module
import pandas as pd
# Show which pandas release is installed.
print('Pandas version ' + pd.__version__)
Create Data
# Build a small frame from (name, births) records. The enclosing
# parentheses already let each call span several lines, so no
# backslash continuations are needed.
df = pd.DataFrame(
    data=[
        ('Bob', 968),
        ('Jessica', 155),
    ],
    columns=['Names', 'Births'],
)
# Weekly timestamps anchored on Mondays between the two dates.
# The result is not stored; the call is shown for reference only.
pd.date_range(
    start='1/1/2009',
    end='12/31/2012',
    freq='W-MON',
)
# Write the frame to CSV without the index column and without a header row.
df.to_csv('d.csv', index=False, header=False)
# Write the frame to an Excel workbook without the index column.
df.to_excel('d.xlsx', index=False)
Get Data
# Read from file.
# NOTE(review): the CSV is read from C:\ while the export step writes
# 'd.csv' to the working directory -- confirm the intended location.
# header=None: treat the first line as data, not as column labels.
df = pd.read_csv(r'C:\d.csv', header=None)
# names=[...]: supply the column labels explicitly.
df = pd.read_csv(r'C:\d.csv', names=["name", "birth"])
# Read the sheet at position 0 of the workbook.
df = pd.read_excel(r'd.xlsx', 0, index_col=None)
# Reset the column names.
df.columns = ["name", "Births"]
# Get the index.
df.index
# Create a new column if it does not exist;
# overwrite every value in it if it does.
df['firstAlpha'] = ''
df['name']  # single column
df.name  # single column (attribute access)
df[0:1]  # single row
df[0:1]['name']  # row 0 and column 'name'
Prepare Data
df.dtypes  # dtype of every column
df['name'].dtype  # dtype of a single column
# The columns were relabelled to ["name", "Births"] earlier, so the
# lookups below use 'name'; 'Names' would raise KeyError at this point.
df['name'].unique()  # distinct values
df['name'].describe()  # summary statistics for the column
df.groupby('name')  # GroupBy object keyed on the 'name' column
# `func` is a placeholder for any callable that accepts one cell value;
# it must be defined before this line is run.
df['name'].apply(func)
df['name'].apply(lambda x: x.upper())  # upper-case every value
Analyze Data
df.head(3)  # first three rows
df.tail(3)  # last three rows
df.sort_values(['Births'], ascending=False)  # largest birth count first
df['Births'].max()
df['Births'].min()
df['Births'].mean()
df['Births'].median()
# Name(s) on the row(s) holding the maximum birth count; a single .loc
# selects rows and column together instead of chaining two lookups.
df.loc[df['Births'] == df['Births'].max(), 'name']
# Leading character of each name when it is not whitespace. The pattern
# is a raw string so that \S reaches the regex engine unchanged.
df['name'].str.extract(r'(^\S)', expand=False)
# The column is labelled 'name' after the earlier relabelling, so the
# group-bys use that label ('Names' would raise KeyError here).
df.groupby('name').sum()  # per-name totals
df.groupby('name').size()  # number of rows per name
df.groupby('name').sum().std()  # spread of the per-name totals
df.groupby('name').sum().mean()  # average of the per-name totals
Present Data (Refer to matplotlib)
df['Births'].plot()  # plot the Births column with the default plot kind
df['Births'].plot.bar()  # bar chart of the Births column