# Load the orders data into a DataFrame; load_dataset() is defined outside
# this excerpt.
df = load_dataset()
# Preview the first rows to check the column names and value formats.
df.head()
 | order_id | date | date_of_meal | participants | meal_price | type_of_meal | heroes_adjustment | meal_tip |
---|---|---|---|---|---|---|---|---|
0 | 839FKFW2LLX4LMBB | 2016-05-27 | 2016-05-31 07:00:00+02:00 | ['David Bishop'] | 469 | Breakfast | False | 93.8 |
1 | 97OX39BGVMHODLJM | 2018-09-27 | 2018-10-01 20:00:00+02:00 | ['David Bishop'] | 22 | Dinner | False | 4.4 |
2 | 041ORQM5OIHTIU6L | 2014-08-24 | 2014-08-23 14:00:00+02:00 | ['Karen Stansell'] | 314 | Lunch | False | 62.8 |
3 | YT796QI18WNGZ7ZJ | 2014-04-12 | 2014-04-07 21:00:00+02:00 | ['Addie Patino'] | 438 | Dinner | False | 87.6 |
4 | 6YLROQT27B6HRF4E | 2015-07-28 | 2015-07-27 14:00:00+02:00 | ['Addie Patino' 'Susan Guerrero'] | 690 | Lunch | False | 138.0 |
def iterrows_original_meal_price(df):
    """Add an ``original_meal_price`` column to *df*, one row at a time.

    This is the row-by-row ``iterrows`` variant of the computation
    ``meal_price_with_tip - meal_tip``.

    Args:
        df: DataFrame with ``meal_price_with_tip`` and ``meal_tip`` columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    for i, row in df.iterrows():
        # Write through a single ``.loc[row, column]`` call. The chained form
        # ``df.loc[i]["col"] = ...`` assigns into a temporary row object, so
        # the value never reaches ``df``.
        df.loc[i, "original_meal_price"] = (
            row["meal_price_with_tip"] - row["meal_tip"]
        )
    return df
%%timeit -r 1 -n 1
iterrows_original_meal_price(df)
35min 13s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
def apply_original_meal_price(df):
    """Add ``original_meal_price`` to *df* using a row-wise ``DataFrame.apply``.

    Args:
        df: DataFrame with ``meal_price_with_tip`` and ``meal_tip`` columns.

    Returns:
        The same DataFrame object, with the new column assigned in place.
    """
    def price_without_tip(row):
        # Called once per row because of ``axis=1`` below.
        return row['meal_price_with_tip'] - row['meal_tip']

    df["original_meal_price"] = df.apply(price_without_tip, axis=1)
    return df
%%timeit
apply_original_meal_price(df)
22.5 s ± 170 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
def vectorized_original_meal_price(df):
    """Add ``original_meal_price`` to *df* with one column-wise subtraction.

    Args:
        df: DataFrame with ``meal_price_with_tip`` and ``meal_tip`` columns.

    Returns:
        The same DataFrame object, with the new column assigned in place.
    """
    price_with_tip = df["meal_price_with_tip"]
    tip = df["meal_tip"]
    # Whole-column arithmetic: no Python-level loop over rows.
    df["original_meal_price"] = price_with_tip - tip
    return df
%%timeit
vectorized_original_meal_price(df)
2.46 ms ± 18.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# 5000 ones in NumPy's default float dtype; reused below as the common
# payload when comparing per-dtype memory footprints.
ones = np.ones(5000)
ones
array([1., 1., 1., ..., 1., 1., 1.])
# Build one column per dtype, each holding the same values cast from `ones`,
# so the per-column byte counts can be compared directly.
types = [
    'object',
    'complex128',
    'float64',
    'int64',
    'int32',
    'int16',
    'int8',
    'bool',
]
df = pd.DataFrame({dtype_name: ones.astype(dtype_name) for dtype_name in types})
# deep=True also counts the Python objects referenced by the object column.
df.memory_usage(index=False, deep=True)
object 160000 complex128 80000 float64 40000 int64 40000 int32 20000 int16 10000 int8 5000 bool 5000 dtype: int64
df.memory_usage(deep=True).sum()
478844140
df.memory_usage(deep=True)
Index 8002720 order_id 73024820 date 67022780 date_of_meal 82027880 participants 84977580 meal_price 36012240 type_of_meal 63688760 heroes_adjustment 32076480 meal_tip 32010880 dtype: int64
# Target dtype for every column: string-like columns become categoricals,
# numeric and flag columns are cast to narrower types.
compact_dtypes = {
    'order_id': 'category',
    'date': 'category',
    'date_of_meal': 'category',
    'participants': 'category',
    'meal_price': 'int16',
    'type_of_meal': 'category',
    'heroes_adjustment': 'bool',
    'meal_tip': 'float32',
}
df = df.astype(compact_dtypes)
df.memory_usage(deep=True).sum()
36999962
df.memory_usage(deep=True)
Index 8002720 order_id 8963321 date 2204942 date_of_meal 3942538 participants 5883450 meal_price 2000680 type_of_meal 1000611 heroes_adjustment 1000340 meal_tip 4001360 dtype: int64
def proccess_file(huge_file_path, chunksize=10 ** 6, handler=None):
    """Read a large CSV in chunks and hand each chunk to a handler.

    Only one chunk of ``chunksize`` rows is requested from
    ``pandas.read_csv`` at a time, instead of loading the whole file.

    Args:
        huge_file_path: Path (or file-like object) passed to
            ``pandas.read_csv``.
        chunksize: Number of rows per chunk.
        handler: Callable invoked with each chunk DataFrame. When omitted,
            the module-level ``process`` function is used.
    """
    if handler is None:
        # Looked up at call time, so ``process`` only has to exist when the
        # default is actually used.
        handler = process
    # Read from the ``huge_file_path`` argument; the previous body referenced
    # an undefined name ``path`` here.
    for chunk in pd.read_csv(huge_file_path, chunksize=chunksize):
        handler(chunk)
%%timeit
df["meal_price_with_tip"].astype(object).mean()
96 ms ± 499 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit
df["meal_price_with_tip"].astype(float).mean()
4.27 ms ± 34.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit
df[df.type_of_meal=="Breakfast"]
103 ms ± 348 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit
df.query("type_of_meal=='Breakfast'")
82.4 ms ± 223 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
def foo(N):
    """Return the sum of the integers ``0 .. N-1`` using a plain Python loop.

    The explicit loop is kept because this function serves as the
    pure-Python variant in the timing comparison.
    """
    total = 0
    for step in range(N):
        total += step
    return total
%%timeit
df.meal_price_with_tip.map(foo)
17.9 s ± 25.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%%cython
def cython_foo(long N):
    """Return the sum of the integers ``0 .. N-1`` using C ``long`` locals."""
    # Both the running total and the loop counter are declared as C longs.
    cdef long total = 0
    cdef long step
    for step in range(N):
        total += step
    return total
%%timeit
df.meal_price_with_tip.map(cython_foo)
365 ms ± 2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
@jit(nopython=True)
def numba_foo(N):
    """Return the sum of the integers ``0 .. N-1``; compiled via ``jit``."""
    total = 0
    for step in range(N):
        total += step
    return total
%%timeit
df.meal_price_with_tip.map(numba_foo)
414 ms ± 596 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
def load_data(size=2 ** 30):
    """Return a one-dimensional ``uint8`` array filled with ones.

    Args:
        size: Number of elements. Each element is one byte, so the default
            of ``2 ** 30`` allocates 1 GiB.

    Returns:
        A NumPy array of shape ``(size,)`` and dtype ``uint8``.
    """
    return np.ones(size, dtype=np.uint8)
%%memit
def proccess():
    """Load the data and return ``another_foo(foo(data))``.

    ``data`` stays bound to the array returned by ``load_data()`` for the
    whole nested call, so that array cannot be released while ``foo`` and
    ``another_foo`` run.
    """
    data = load_data()
    # NOTE(review): foo/another_foo are not defined for arrays in this
    # excerpt; presumably each returns a new array -- confirm.
    return another_foo(foo(data))
proccess()
peak memory: 8106.62 MiB, increment: 3042.64 MiB
%%memit
def proccess():
    """Load the data, then pass it through ``foo`` and ``another_foo``.

    Every step rebinds the single name ``data``, so once a step has
    returned, this function no longer references that step's input.
    """
    data = load_data()
    # After this assignment the loaded array is no longer referenced here and
    # can be released before another_foo runs (assuming foo does not return
    # its argument or keep a reference -- TODO confirm).
    data = foo(data)
    data = another_foo(data)
    return data
proccess()
peak memory: 7102.64 MiB, increment: 2038.66 MiB