Some useful functions for working effectively with pandas
import pandas as pd
import numpy as  np
from nbdev.showdoc import *

Create a DataFrame from a large CSV file for the demo.

# Load the demo dataset. low_memory=False reads the whole file in one pass
# so pandas infers a single consistent dtype per column.
csv_path = '/home/condor/datasets/mlb-games/dataquest-mlb-game-logs/game_logs.csv'
df = pd.read_csv(csv_path, low_memory=False)
# memory_usage='deep' introspects object columns for their true footprint.
df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171907 entries, 0 to 171906
Columns: 161 entries, date to acquisition_info
dtypes: float64(77), int64(6), object(78)
memory usage: 861.6 MB
df_meminfo(df)
Memory usage for DataFrames types of columns:
			  Average	    Total
int64:  		  1.12 MB	  7.87 MB
object:  		  9.53 MB	752.74 MB
float64:  		  1.29 MB	100.99 MB
Total for DataFrame:			861.60 MB

df_meminfo[source]

df_meminfo(df)

Prints the average and total memory usage for each dtype in df.

mem_usage[source]

mem_usage(pandas_obj)

Returns the memory used by pandas_obj (a DataFrame or Series), in MB.

# Footprint of a single object-dtype column.
day_of_week = df['day_of_week']
print(mem_usage(day_of_week))
9.84 MB
# Downcast every integer column to the smallest unsigned subtype that can
# hold its values, then compare memory before vs. after.
df_int = df.select_dtypes(include=['int'])
converted_int = df_int.apply(pd.to_numeric, downcast='unsigned')

print(mem_usage(df_int))
print(mem_usage(converted_int))

# Tally how many columns landed in each dtype before and after the downcast.
compare_ints = pd.DataFrame({'before': df_int.dtypes, 'after': converted_int.dtypes})
compare_ints.apply(pd.Series.value_counts)
7.87 MB
1.48 MB
before after
uint8 NaN 5.0
uint32 NaN 1.0
int64 6.0 NaN
# Same exercise for the float columns: downcast float64 to float32 where the
# values allow it (here all 77 columns fit), then compare memory usage.
df_fl = df.select_dtypes(include=['float'])
converted_fl = df_fl.apply(pd.to_numeric, downcast='float')

print(mem_usage(df_fl))
print(mem_usage(converted_fl))

# Dtype counts before vs. after the downcast.
compare_fls = pd.DataFrame({'before': df_fl.dtypes, 'after': converted_fl.dtypes})
compare_fls.apply(pd.Series.value_counts)
100.99 MB
50.49 MB
before after
float32 NaN 77.0
float64 77.0 NaN