# Neural Networks
# numpy
## nan

```python
import numpy as np

l = np.array([float("nan"), np.nan])                   # [nan, nan], dtype float64
np.isnan(l)                                            # [True, True]

l = np.array([float("nan"), np.nan], dtype='object')   # [nan, nan], dtype object
np.isnan(l)                                            # raises TypeError: object dtype cannot be coerced to float64
l.astype(bool)                                         # [True, True]; anything non-0 / non-False becomes True

np.nan == np.nan                                       # False
float("nan") == float("nan")                           # False
```
## np.concatenate

`np.concatenate((l1, l2), axis=1)`: stack on an existing axis.

## np.stack

`np.stack(sequence, axis=0)`: stack on a new axis.

```python
a = [np.array(['a', 'b']), np.array(['c', 'd'])]   # two arrays of shape (2,)
np.stack(a, axis=0)   # [['a', 'b'], ['c', 'd']], shape (2, 2)
np.vstack(a)          # [['a', 'b'], ['c', 'd']], shape (2, 2)
```
## np.expand_dims

Create a new axis with dim 1:

```python
a = np.array([1, 2])        # shape (2,)
np.expand_dims(a, axis=0)   # shape (1, 2)
```
## np.setdiff1d

`np.setdiff1d(l1, l2)`: sorted unique values in `l1` that are not in `l2`.
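A minimal example:

```python
l1 = np.array([1, 2, 3, 4])
l2 = np.array([3, 4, 5])
np.setdiff1d(l1, l2)   # array([1, 2])
```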
## np.percentile

- Formula (0-indexed, numpy's default): index = q/100 * (n - 1), where n is the number of elements (see the example below)
  - Interpolation when the index is not an integer:
    - linear: i + (j - i) * fraction
    - midpoint
    - lower
    - higher
  - 0th percentile => minimum
  - 100th percentile => maximum
- Another formula (1-indexed): index = q/100 * (n + 1); it has a 0th percentile but no 100th percentile (your percentile counts how many others are smaller than you)
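A quick check of the 0-indexed formula with the different interpolation modes (a sketch; note that older numpy uses the keyword `interpolation` instead of `method`):

```python
x = np.array([1, 2, 3, 4, 5])              # n = 5
# index = 40/100 * (5 - 1) = 1.6 -> between x[1] = 2 and x[2] = 3
np.percentile(x, 40)                       # 2.6  (linear: 2 + (3 - 2) * 0.6)
np.percentile(x, 40, method='lower')       # 2
np.percentile(x, 40, method='higher')      # 3
np.percentile(x, 40, method='midpoint')    # 2.5
```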
## np.where

Conditional assignment:

```python
np.where([True, True, False], ['a', 'b', 'c'], 'gua')   # => ['a', 'b', 'gua']
```

## index

```python
l = np.array([1, 2, 3])
l[[True, False, True]]   # [1, 3]; boolean mask selection, result is 1-D

# fancy indexing with (row, col) index arrays
r = [0, 1, 2]
c = [0, 1, 2]
l = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9]])
l[(r, c)]                # [1, 5, 9]
```
## np.nanmean

Removes NaN values first, then computes the average.
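A minimal example:

```python
x = np.array([1.0, np.nan, 3.0])
np.mean(x)      # nan -- a single NaN poisons the ordinary mean
np.nanmean(x)   # 2.0 -- NaNs are dropped before averaging
```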
## Broadcasting

- Rule 1: If the two arrays differ in their number of dimensions, the shape of the one with fewer dimensions is padded with ones on its leading (left) side.
- Rule 2: If the shape of the two arrays does not match in any dimension, the array with shape equal to 1 in that dimension is stretched to match the other shape.
- Rule 3: If in any dimension the sizes disagree and neither is equal to 1, an error is raised.

```python
a = np.arange(3).reshape((3, 1))   # a.shape = (3, 1) -> (3, 3)
b = np.arange(3)                   # b.shape = (3,)   -> (1, 3) -> (3, 3)
a + b                              # works, result shape (3, 3)

# error case
M = np.ones((3, 2))                # M.shape = (3, 2)
a = np.arange(3)                   # a.shape = (3,)   -> (1, 3); 2 vs 3 disagree in the last dim
M + a                              # raises ValueError
```
## tolist

```python
np.float64(6.4).tolist()          # => 6.4 (Python float)
np.array([1, 2, 3, 4]).tolist()   # => [1, 2, 3, 4] (Python list)
```
# pandas
API Documentation
## nan

Similar to numpy, with some extras:

```python
import numpy as np
import pandas as pd

l1 = pd.DataFrame([np.nan], dtype='string')   # displays as <NA>
l1 = pd.DataFrame([None], dtype='string')     # displays as <NA>
l2 = pd.DataFrame([np.nan], dtype='str')      # displays as NaN
l2 = pd.DataFrame([None], dtype='str')        # displays as None

l1.replace({np.nan: None})   # string -> object, and <NA> => None
l1.fillna('2333')            # <NA> is filled
l2.fillna('2333')            # None and NaN are filled

df.isnull()                  # alias of isna
df.isna()
```
## Return types

```python
df[col]            # Series
df[col].values     # numpy array
df[col].unique()   # numpy array
```
## Assign

TIP

Assignment with `df.loc[mask, cols]`:

- if `mask` is a `start:end` slice, both `start` and `end` are inclusive
- `df.loc[mask, cols] = ...` => `cols` must already exist
- if `df.loc[mask, col]` gives a series, only a series can be assigned; anything else gets `NaN` assigned
- if `df.loc[mask, cols]` gives a dataframe, a dataframe with matched column names or a series can be assigned; anything else gets `NaN` assigned
Assignment with `[]`:

- `[]` does not support a mask
- anything can be assigned as long as the shape matches
Assignment with `df.loc`:

```python
# df_b.loc[mask, cols] => dataframe
# df_a's columns should match cols
df_b.loc[mask, cols] = df_a

# df_b.loc[mask, cols] => dataframe
# df_a's columns do not matter
df_b.loc[mask, cols] = df_a.values

# df_b.loc[mask, col] => series
# only a series (or values of matching length) can be assigned
df_b.loc[mask, col] = df_a.values
```
Assignment with `[]`:

```python
# anything is okay as long as the shape matches
df[col] = df_a.values
df[col] = df_a
df[cols] = df_a
df[cols] = df_a.values
df[col_not_exist] = df_a

# if the value to assign is a scalar
df[col] = 'str'
df[col] = ('str', 'str')   # quick scalar assignment does not support a tuple
```
## Selection

- `df.loc[mask, df.columns != 'col_name']`: select all other columns
- `df[df.columns[~df.columns.isin(['col_name'])]]`: select all other columns
## astype

```python
"""
Prior to pandas 1.0, object dtype was the only option. This was unfortunate
for many reasons:

- You can accidentally store a mixture of strings and non-strings in an
  object dtype array. It's better to have a dedicated dtype.
- object dtype breaks dtype-specific operations like DataFrame.select_dtypes().
  There isn't a clear way to select just text while excluding non-text but
  still object-dtype columns.
- When reading code, the contents of an object dtype array is less clear
  than 'string'.
"""
df.astype(str)        # => object dtype
df.astype('string')   # => string dtype
```
## inplace

- Yes: drop, rename, drop_duplicates, replace, fillna
- No: merge, join
- Problematic: multi-column selections; `df.loc[:, [col1, col2]].fillna('', inplace=True)` does not work (see the sketch below)
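A minimal sketch of the pitfall and a working alternative (the column names are made up for illustration):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'col1': ['a', np.nan], 'col2': [np.nan, 'b']})

# does NOT modify df: .loc[:, ['col1', 'col2']] returns a copy,
# which is filled in place and then discarded
df.loc[:, ['col1', 'col2']].fillna('', inplace=True)

# works: assign the filled result back
df[['col1', 'col2']] = df[['col1', 'col2']].fillna('')
```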
## drop

```python
df.drop(a_list, axis=0, inplace=True)   # drop rows; a_list holds index labels
df.drop(a_list, axis=1, inplace=True)   # drop columns; a_list holds column names
```
## square brackets

- `df[col]` => series
- `df[[col1, col2]]` => dataframe
- `df[col1, col2]` => deprecated; do not use
- `df[[col]]` => dataframe
- `df.loc[mask, col]` => series
- `df.loc[mask, [col]]` => dataframe
- `df.loc[mask, [col1, col2]]` => dataframe
- `groupby(key_col, as_index=False)[col]` => DataFrameGroupBy, but works like a series
- `groupby(key_col)[col]` => SeriesGroupBy
- `groupby(key_col)[[col]]` => DataFrameGroupBy
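A quick sketch of the series vs dataframe distinction (toy columns):

```python
df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
type(df['a'])                  # Series
type(df[['a']])                # DataFrame
type(df[['a', 'b']])           # DataFrame
type(df.groupby('a')['b'])     # SeriesGroupBy
type(df.groupby('a')[['b']])   # DataFrameGroupBy
```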
## Combine a column inside groups

```python
# x is a series
df = df.groupby(key_col)[col_to_combine].apply(lambda x: join_function(x)).reset_index()

# x is a dataframe
df = df.groupby(key_col)[[col_to_combine]].apply(lambda x: join_function(x)).reset_index()

# x is a series
df_golden.pivot_table(index=key_col, values=col_to_combine,
                      aggfunc=lambda x: join_function(x)).reset_index()
```
## apply

TIP

- if the lambda receives a dataframe row (apply over a dataframe), refer to column values by column name
- if the lambda receives a series of values, refer to values by integer position
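A sketch of the two access patterns (toy columns):

```python
df = pd.DataFrame({'a': [1, 2], 'b': [10, 20]})

# dataframe apply (axis=1): each row arrives as a Series indexed by column names
df.apply(lambda row: row['a'] + row['b'], axis=1)   # 11, 22

# series apply via groupby: x is a Series of values, refer to them by position
df.groupby('a')['b'].apply(lambda x: x.iloc[0])     # 10, 20
```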
### groupby

- `groupby(key_col, as_index=False)[col]` in: `apply` uses a series
- other dataframes in: `apply` uses a dataframe
- series in: `apply` uses a series
- `sort=True` (default): group order is sorted
### df

Understanding axis:

- `axis=0`: apply along the rows, so a column is passed in
- dataframe or series in: `apply` always uses a series
- `apply(axis=1)` => group the columns, pass in a row
- `apply(axis=0)` => group the rows, pass in a column
- `apply(lambda, result_type=None)` (default) => only acts when `axis=1`. The return type is determined by what the lambda returns (probed with an empty series on an empty dataframe): if it errors, the result is a dataframe; if it returns a series, the result is a dataframe; if it returns a list/scalar, the result is a series
- `apply(lambda, result_type='reduce')` => only acts when `axis=1`; the final result will be a series if possible (error or list/scalar)
- `apply(lambda, result_type='expand')` => only acts when `axis=1`; the final result will be a dataframe if possible (error or series or list)
- `apply(lambda, result_type='broadcast')` => only acts when `axis=1`; the final result will be broadcast to a dataframe
```python
def custom_function(x):
    return x[0]

# result_type is None, lambda returns a series
# [silent error: empty_df passes an empty series x, which has no index 0,
#  so the series x itself is returned back] => result is a df
empty_df.apply(lambda x: custom_function(x), axis=1)
"""
foo
"""

# result_type is None, lambda returns a scalar => result is a series
df.apply(lambda x: custom_function(x), axis=1)
"""
0    1
1    1
2    1
dtype: int64
"""

# result_type is None, lambda returns a list => result is a series
df.apply(lambda x: [1, 2], axis=1)
"""
0    [1, 2]
1    [1, 2]
2    [1, 2]
dtype: object
"""

# result_type is None, lambda returns a series => result is a df
df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
"""
   foo  bar
0    1    2
1    1    2
2    1    2
"""

# result_type is expand, lambda returns a list => result is a df
df.apply(lambda x: [1, 2], axis=1, result_type='expand')
"""
   0  1
0  1  2
1  1  2
2  1  2
"""

# result_type is reduce, lambda returns a list => result is a series
df.apply(lambda x: [1, 2], axis=1, result_type='reduce')
"""
0    [1, 2]
1    [1, 2]
2    [1, 2]
dtype: object
"""
```
## transform

### groupby

- `transform` gets a column passed in and expands the result to the same shape (and same order) as the argument; if the passed column is the group column, `x.name` is the group name, otherwise `x.name` is the column name
- `apply` can have a column, a row or a dataframe passed in, and the lambda result is not expanded, so it can return a dataframe of variable shape

### df

- `transform` can get both a column and a row, but the lambda function must return a result with the same shape as the input
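A sketch contrasting groupby `transform` and `apply` (toy data):

```python
df = pd.DataFrame({'g': ['a', 'a', 'b'], 'v': [1, 2, 30]})

# transform: the result is expanded back to the original shape and order
df['group_mean'] = df.groupby('g')['v'].transform('mean')   # 1.5, 1.5, 30.0

# apply: the result keeps whatever shape the lambda returns (here one row per group)
df.groupby('g')['v'].apply(lambda x: x.max() - x.min())      # a: 1, b: 0
```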
## filter

```python
df.filter(like='abc')           # columns containing 'abc', case sensitive
df.filter(like='abc', axis=0)   # index labels containing 'abc', case sensitive
df.filter(regex=r'[a-z]')
df.filter(items=[])
```
## replace

WARNING

Before `replace`, ensure that the dataframe or series values have a valid type.

```python
# full match, then replace
df.replace('value_full_match', '123')

# to replace None, use a dict
df.replace({None: '123'})

# with regex=True, replace as long as the pattern is found
df.replace('value_contain', '123', regex=True)
```
## merge vs join

- `merge` can work on columns, whereas `join` must work on indices
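A minimal sketch of the difference (toy frames):

```python
left = pd.DataFrame({'key': ['a', 'b'], 'x': [1, 2]})
right = pd.DataFrame({'key': ['a', 'b'], 'y': [3, 4]})

# merge joins on columns directly
left.merge(right, on='key')

# join aligns on the index, so move the key into the index first
left.set_index('key').join(right.set_index('key'))
```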
## groupby

- column first or function first are both okay
- `count`: counts non-NaN values in a group
- `nunique`: counts unique non-NaN values in a group
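A sketch showing the difference between `count` and `nunique`:

```python
df = pd.DataFrame({'g': ['a', 'a', 'a', 'b'],
                   'v': [1, 1, np.nan, 2]})
df.groupby('g')['v'].count()     # a: 2, b: 1  (NaN ignored)
df.groupby('g')['v'].nunique()   # a: 1, b: 1  (duplicates collapsed, NaN ignored)
```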
## pivot_table

Create columns from values in rows. Given the dataframe below:

```
   yearPassed  policyType  count
0        1990           1   2000
1        1990           2   1400
2        1990           3   1200
3        1991           3     70
4        1992           2   1000
5        1992           3    800
```

```python
df = df.pivot_table(index=['yearPassed'], columns=['policyType'], values='count').fillna(0)
df
# the columns form an Index named policyType
"""
policyType       1       2       3
yearPassed
1990        2000.0  1400.0  1200.0
1991           0.0     0.0    70.0
1992           0.0  1000.0   800.0
"""

df.columns = ['p1', 'p2', 'p3']
df
"""
                p1      p2      p3
yearPassed
1990        2000.0  1400.0  1200.0
1991           0.0     0.0    70.0
1992           0.0  1000.0   800.0
"""
```
## pd.melt

Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.

```
   a  b
x  1  3
y  2  4
```

```python
pd.melt(df.reset_index(), var_name='c2', value_name='c1', id_vars='index').set_index('index')
"""
      c2  c1
index
x      a   1
y      a   2
x      b   3
y      b   4
"""
```
## date wrangling

Float epoch (unit: seconds) to `pd.Timestamp`:

```python
pd.to_datetime(df[col], unit='s')
```

Methods from `pd.Timestamp`:

WARNING

Getting the raw values of a `pd.Timestamp` column (`df[col].values`) results in `np.datetime64`, which does not contain timezone info.

```python
# datetime.weekday(): 0 is Monday, 6 is Sunday
df[timestamp_col].dt.weekday

# get datetime.date objects, e.g. datetime.date(2020, 6, 6)
df[timestamp_col].dt.date

# timezone conversion
df[zone_aware_col] = pd.to_datetime(df[col], unit='s').dt.tz_localize('UTC')
# before conversion, the pd.Timestamp must have a timezone assigned
df[zone_aware_col].dt.tz_convert('Asia/Singapore')
```
## Problem solving

### Split into small frames

```python
answer_list = []

def solve_problem_with_small_df(small_df):
    for index_label, row in small_df.iterrows():
        pass   # solve the sub-problem row by row
    return small_df

for group_key, small_df in df.groupby(key_col):
    solved_df = solve_problem_with_small_df(small_df)
    answer_list.append(solved_df)

pd.concat(answer_list)
```
Example:

```python
df_golden.pivot_table(index='Field ID', values='Pages Gold',
                      aggfunc=lambda x: ','.join(x)).reset_index()

df_golden.groupby('Field ID')['Pages Gold'].apply(lambda x: ','.join(x)).reset_index()

for idx, page_series in df_golden.groupby('Field ID')['Pages Gold']:
    l = ','.join(page_series)
```
### Vectorized method: count consecutive positive values

```python
a = [0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0]
b = [1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0]
df = pd.DataFrame(zip(a, b), columns=['a', 'b'])

# running count of consecutive non-zero values, reset at every zero
c = lambda y: y * (y.groupby((y != y.shift()).cumsum()).cumcount() + 1)

# concatenating a and b into one column does not affect the algorithm:
# if the 1st item of b is 0, its result is guaranteed to be 0;
# if the 1st item of b is 1, it must start a new group, so its result is 1
c(df.unstack()).unstack().T
```
## IO

- to read a csv with a desired type: `pd.read_csv('file_path', dtype=str)`
- to save a csv: `df.to_csv('file_path', sep=',', index=False)`
## Copy

- `pd.DataFrame(df)`: no (shares the underlying data)
- `df.copy(deep=True)`: yes (deep copy)
## Insert

`df.insert(idx, column_name, column_value)`, where `column_value` can be a single value, an array or a series; the insert happens in place.
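A minimal example (toy frame):

```python
df = pd.DataFrame({'a': [1, 2], 'c': [5, 6]})
df.insert(1, 'b', [3, 4])   # inserts column 'b' at position 1, in place; returns None
# columns are now: a, b, c
```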
## Aggregation

```python
df.groupby('col').agg(new_name=('col_name', 'agg_func_name'))
```
## Creation

Examples:

```python
pd.DataFrame([1, 2, 3, 4], columns=['a'])                  # 4 rows, 1 column
pd.DataFrame({'key': 'value', 'key2': 'value2'}.items())   # 2 rows, 2 columns
pd.DataFrame({'column': [1, 2, 3, 4]})                     # 4 rows, 1 column
```

Assign a list to a cell and expand the cell into a column:

```python
df = pd.DataFrame([1, 2, 3, 4], columns=['a'])   # 4 rows, 1 column
df['a'] = df['a'].astype('object')               # now a list can be assigned as a cell value
df.at[0, 'a'] = [1, 2, 3, 4]                     # assign a list as the value; .loc leads to an error
pd.DataFrame(df.loc[0].to_dict())                # same as the original df
```
## fillna

```python
# fill NaN in column c with values from column b
df['c'] = df['c'].fillna(df['b'])
```
## str

```python
df['string'].str.split(' ').str[-1]   # like '123 456'.split(' ')[-1], applied per row
```