# Neural Networks

# numpy

  1. nan

    import numpy as np
    
    l = np.array([float("nan"), np.nan])  # [nan, nan], dtype float64
    np.isnan(l)  # [True, True]
    
    l = np.array([float("nan"), np.nan], dtype='object')  # [nan, nan], dtype object
    np.isnan(l)  # raises TypeError: isnan is not supported for object arrays
    
    l.astype(bool)  # [True, True]; anything that is not 0/False becomes True
    
    np.nan == np.nan  # False; NaN never equals itself
    float("nan") == float("nan")  # False
    
  2. np.concatenate((l1, l2), axis=1)

    stacks along an existing axis
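
    For instance, joining a (2, 2) and a (2, 1) array along the existing column axis:

    a = np.array([[1, 2], [3, 4]])  # (2, 2)
    b = np.array([[5], [6]])  # (2, 1)
    np.concatenate((a, b), axis=1)  # [[1, 2, 5], [3, 4, 6]], shape (2, 3)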

  3. np.stack(sequence, axis=0)

    stacks along a new axis

    a = [np.array(['a', 'b']), np.array(['c', 'd'])] # 2x (2,)
    np.stack(a, axis=0) # [['a', 'b'],['c', 'd']] # 2x2
    np.vstack(a) # [['a', 'b'],['c', 'd']] # 2x2
    
  4. np.expand_dims

    create a new axis with dim 1

    a = np.array([1, 2]) # (2,)
    np.expand_dims(a, axis=0) # (1, 2)
    
  5. np.setdiff1d(l1, l2)
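
    returns the sorted unique values in l1 that are not in l2; for example:

    np.setdiff1d([1, 2, 3, 4], [3, 4, 5])  # [1, 2]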

  6. np.percentile

    • Formula: index = q/100 * (n - 1), 0-based, where n is the number of values
    • Interpolation (when the index falls between positions with values i and j):
      • linear: i + (j - i) * fraction
      • midpoint
      • lower
      • higher
    • 0th percentile => minimum
    • 100th percentile => maximum
    • Alternative formula: rank = q/100 * (n + 1), 1-based; this definition has a 0th percentile but no 100th percentile (a value's percentile is the count of other values smaller than it)
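
    A worked example of the 0-based formula (note: numpy >= 1.22 names the keyword method; older versions call it interpolation):

    a = np.array([1, 2, 3, 4])
    # 0-based index = 50/100 * (4 - 1) = 1.5 => between a[1] = 2 and a[2] = 3
    np.percentile(a, 50)  # 2.5, linear: 2 + (3 - 2) * 0.5
    np.percentile(a, 50, method='lower')  # 2
    np.percentile(a, 50, method='higher')  # 3
    np.percentile(a, 50, method='midpoint')  # 2.5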
  7. np.where

    element-wise conditional selection (a vectorized if/else)

    np.where([True, True, False], 'gua', ['a', 'b', 'c'])  # => ['gua', 'gua', 'c']
    
    
  8. index

    l = np.array([1,2,3])
    l[[True, False, True]]  # [1, 3]; a boolean mask keeps elements where True
    
    r = [0, 1, 2]
    c = [0, 1, 2]
    l = np.array([[1,2,3], [4,5,6], [7,8,9]])
    l[(r, c)]  # [1, 5, 9]; picks elements at (0,0), (1,1), (2,2)
    
  9. np.nanmean

    ignores NaNs and averages the remaining values
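
    For example:

    np.nanmean([1.0, np.nan, 3.0])  # 2.0, NaN is ignored
    np.mean([1.0, np.nan, 3.0])  # nan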

  10. Broadcasting

    • Rule 1: If the two arrays differ in their number of dimensions, the shape of the one with fewer dimensions is padded with ones on its leading (left) side.

    • Rule 2: If the shape of the two arrays does not match in any dimension, the array with shape equal to 1 in that dimension is stretched to match the other shape.

    • Rule 3: If in any dimension the sizes disagree and neither is equal to 1, an error is raised.

      a = np.arange(3).reshape((3, 1))  # a.shape = (3, 1) -> (3, 3)
      b = np.arange(3)  # b.shape = (3,) -> (1, 3) -> (3, 3)
      a + b  # works: both sides stretch to (3, 3)
      
      # error
      M = np.ones((3, 2))  # M.shape = (3, 2)
      a = np.arange(3)  # a.shape = (3,) -> (1, 3); trailing dims 2 vs 3, neither is 1
      M + a  # raises ValueError: operands could not be broadcast together
      
  11. tolist

    np.float64(6.4).tolist()  # => 6.4 as a Python float
    np.array([1, 2, 3, 4]).tolist()  # => [1, 2, 3, 4]
    

# pandas

API Documentation: see the official pandas API reference.

  1. nan

    Similar to numpy, with a few more wrinkles:

    l1 = pd.DataFrame([np.nan], dtype='string')  # displayed as <NA>
    l1 = pd.DataFrame([None], dtype='string')  # displayed as <NA>
    l2 = pd.DataFrame([np.nan], dtype='str')  # displayed as NaN
    l2 = pd.DataFrame([None], dtype='str')  # displayed as None
    l1.replace({np.nan: None})  # string dtype becomes object, and <NA> => None
    
    l1.fillna('2333')  # <NA> is filled
    l2.fillna('2333')  # both None and NaN are filled
    
    df.isnull()  # alias of isna
    df.isna()
    
  2. Return types

    df[col]  # Series
    df[col].values  # numpy array
    df[col].unique()  # numpy array
    
  3. Assign

    TIP

    Assignment with df.loc:

    • df.loc[mask, cols]: if mask is a start:end label slice, both start and end are inclusive
    • df.loc[mask, cols] => cols must already exist
    • if df.loc[mask, col] yields a series, only a series can be assigned; assignment aligns on index, and unmatched rows get NaN
    • if df.loc[mask, col] yields a dataframe, a dataframe with matching column names or a series can be assigned; unmatched entries get NaN (see the sketch at the end of this item)

    Assignment with []:

    • [] does not support a (mask, cols) pair
    • anything can be assigned as long as the shape matches

    Assignment with df.loc:

    # df_b.loc => dataframe
    # df_a's cols should be cols
    df_b.loc[mask, cols] = df_a
    
    # df_b.loc => dataframe
    # df_a's cols do not matter
    df_b.loc[mask, cols] = df_a.values
    
    # df_b.loc => series
    # assign a series (aligned on index) or an array of matching length
    df_b.loc[mask, col] = df_a.values
    

    Assignment with []:

    # anything is okay
    df[col] = df_a.values
    df[col] = df_a
    df[cols] = df_a
    df[cols] = df_a.values
    df[col_not_exist] = df_a
    
    # if value to assign is a scalar
    df[col] = 'str'
    df[col] = ('str', 'str') # tuples are not broadcast like scalars; the length must match the rows
    

    Selection

    • df.loc[mask, df.columns != 'col_name'] selects all other columns
    • df[df.columns[~df.columns.isin(['col_name'])]] selects all other columns
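
    A minimal sketch of the index-alignment behavior above (names are hypothetical):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3]}, index=[0, 1, 2])
    s = pd.Series([10, 30], index=[0, 2])
    # .loc assignment aligns on index: row 1 has no match in s, so it gets NaN
    df.loc[:, 'a'] = s
    df['a'].tolist()  # [10.0, nan, 30.0]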
  4. astype

    """
    Prior to pandas 1.0, object dtype was the only option. This was unfortunate for many reasons:
    
        You can accidentally store a mixture of strings and non-strings in an object dtype array. It’s better to have a dedicated dtype.
    
        object dtype breaks dtype-specific operations like DataFrame.select_dtypes(). There isn’t a clear way to select just text while excluding non-text but still object-dtype columns.
    
        When reading code, the contents of an object dtype array is less clear than 'string'.
    """
    df.astype(str)  #  => object type
    df.astype('string')  # => string
    
  5. inplace

    • Yes: drop, rename, drop_duplicates, replace, fillna
    • No: merge, join
    • Problematic with multiple columns: df.loc[:, [col1, col2]].fillna('', inplace=True) operates on a temporary copy, so it does not modify df (see the sketch below)
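
    A minimal sketch of the multi-column pitfall (column names hypothetical):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': [np.nan, 1], 'b': [2, np.nan]})
    df.loc[:, ['a', 'b']].fillna('', inplace=True)  # acts on a temporary copy; df is unchanged
    df[['a', 'b']] = df[['a', 'b']].fillna('')  # assigning the result back works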
  6. drop

    df.drop(a_list, axis=0, inplace=True) # drop rows; a_list holds index labels
    df.drop(a_list, axis=1, inplace=True) # drop cols; a_list holds column names
    
  7. square brackets

    • df[col] => series
    • df[[col1, col2]] => dataframe
    • df[col1, col2] => treated as the single tuple key (col1, col2); KeyError unless columns are a MultiIndex
    • df[[col]] => dataframe
    • df.loc[mask, col] => series
    • df.loc[mask, [col]] => dataframe
    • df.loc[mask, [col1, col]] => dataframe
    • groupby(key_col, as_index=False)[col] => DataFrameGroupBy, but works as series
    • groupby(key_col)[col] => SeriesGroupBy
    • groupby(key_col)[[col]] => DataFrameGroupBy
  8. Combine a column inside groups

    # x is a series
    df = df.groupby(key_col)[col_to_combine].apply(lambda x: join_function(x)).reset_index()
    
    # x is a dataframe
    df = df.groupby(key_col)[[col_to_combine]].apply(lambda x: join_function(x)).reset_index()
    
    # x is a series
    df_golden.pivot_table(index=key_col, values=col_to_combine, aggfunc=lambda x: join_function(x)).reset_index()
    
  9. apply

    TIP

    if a dataframe is passed in, refer to column values by column name

    if a series is passed in, refer to values by integer position

    • groupby

      • with groupby(key_col, as_index=False)[col] in, apply receives a series
      • with other dataframes in, apply receives a dataframe
      • with a series in, apply receives a series
      • sort=True (default): group order is sorted by key
    • df

      Understanding axis

      with axis=0 (the default), apply works along the rows, so each column is passed in

      • dataframe or series in, apply always passes a series into the function
      • apply(axis=1) => groups columns, passes in a row
      • apply(axis=0) => groups rows, passes in a column
      • apply(func, result_type=None) => default; only acts when axis=1. The return type is determined by passing in an empty series: if the function errors, the return type is dataframe; if it returns a series, the return type is dataframe; if it returns a list/scalar, the final result is a series
      • apply(func, result_type='reduce') => only acts when axis=1; the final result is a series whenever possible (error or list/scalar)
      • apply(func, result_type='expand') => only acts when axis=1; the final result is a dataframe whenever possible (error or series or list)
      • apply(func, result_type='broadcast') => only acts when axis=1; the final result is broadcast to a dataframe of the original shape
      import pandas as pd
      
      empty_df = pd.DataFrame(columns=['foo'])  # assumed setup, matching the outputs below
      df = pd.DataFrame({'foo': [1, 1, 1]})  # assumed setup, matching the outputs below
      
      def custom_function(x):
        return x[0]
      
      # result_type is None, lambda returns a series [silent error: empty_df passes in an
      # empty series x, which does not have index 0, so that series x is returned back]
      # result is df
      empty_df.apply(lambda x: custom_function(x), axis=1)
      """
          foo
      """
      
      # result_type is None, lambda returns a scalar
      # result is series
      df.apply(lambda x: custom_function(x), axis=1)
      """
      0    1
      1    1
      2    1
      dtype: int64
      """
      
      # result_type is None, lambda returns a list
      # result is series
      df.apply(lambda x: [1, 2], axis=1)
      """
      0    [1, 2]
      1    [1, 2]
      2    [1, 2]
      dtype: object
      """
      
      # result_type is None, lambda returns a series
      # result is df
      df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
      """
          foo  bar
      0    1    2
      1    1    2
      2    1    2
      """
      
      # result_type is expand, lambda returns a list
      # result is df
      df.apply(lambda x: [1, 2], axis=1, result_type='expand')
      """
         0  1
      0  1  2
      1  1  2
      2  1  2
      """
      
      # result_type is reduce, lambda returns a list
      # result is series
      df.apply(lambda x: [1, 2], axis=1, result_type='reduce')
      """
      0    [1, 2]
      1    [1, 2]
      2    [1, 2]
      dtype: object
      """
      
  10. transform

    • groupby
      • transform gets a column passed in and expands the result to the same shape (and same order) as its input; if the passed column is the group column, x.name is the group name; otherwise, x.name is the column name (see the sketch below)
      • apply can have a column, a row, or a dataframe passed in, and the lambda result is not expanded, so it can return a dataframe of variable shape
    • df
      • transform can receive both a column and a row, but the lambda function must return a result with the same shape as its input
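
    A minimal sketch of groupby transform (column names hypothetical):

    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1, 2, 3]})
    # the per-group result is expanded back to the input's shape and order
    df.groupby('key')['val'].transform('mean')  # [1.5, 1.5, 3.0]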
  11. filter

    df.filter(like='abc') # columns whose names contain 'abc'; case-sensitive
    df.filter(like='abc', axis=0) # index labels containing 'abc'; case-sensitive
    df.filter(regex=r'[a-z]')
    df.filter(items=[])
    
  12. replace

    WARNING

    Before calling replace, ensure the dataframe or series values have the expected dtype

    # full match then replace
    df.replace('value_full_match', '123')
    # to replace None, use dict
    df.replace({None: '123'})
    # if regex, as long as the pattern is found, then replace
    df.replace('value_contain', '123', regex=True)
    
  13. merge vs join

    merge can work on columns, whereas join must work on indices; see the sketch below
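
    A small sketch of the difference (frames hypothetical):

    import pandas as pd

    left = pd.DataFrame({'k': ['a', 'b'], 'v1': [1, 2]})
    right = pd.DataFrame({'k': ['a', 'b'], 'v2': [3, 4]})
    left.merge(right, on='k')  # merge joins on columns
    left.set_index('k').join(right.set_index('k'))  # join aligns on indices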

  14. groupby

    selecting the column first or applying the function first both work

    • count: counts non-NaN values in a group
    • nunique: counts unique non-NaN values in a group (see the sketch below)
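
    A quick illustration (names hypothetical):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'k': ['a', 'a', 'b'], 'v': [1, 1, np.nan]})
    df.groupby('k')['v'].count()  # a: 2, b: 0 (non-NaN values)
    df.groupby('k')['v'].nunique()  # a: 1, b: 0 (unique non-NaN values)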
  15. pivot_table

    create columns from values in rows, given the below dataframe:

         yearPassed  policyType  count
    0       1990        1        2000
    1       1990        2        1400
    2       1990        3        1200
    3       1991        3        70
    4       1992        2        1000
    5       1992        3        800
    
    df = df.pivot_table(index=['yearPassed'], columns=['policyType'], values='count').fillna(0)
    df # df.columns is an Index named 'policyType'
    
    policyType       1       2       3
    yearPassed
    1990        2000.0  1400.0  1200.0
    1991           0.0     0.0    70.0
    1992           0.0  1000.0   800.0
    
    df.columns = ['p1', 'p2', 'p3']
    df
                   p1      p2      p3
    yearPassed
    1990        2000.0  1400.0  1200.0
    1991           0.0     0.0    70.0
    1992           0.0  1000.0   800.0
    
  16. pd.melt

    Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.

      a b
    x 1 3
    y 2 4
    
    pd.melt(df.reset_index(), var_name='c2', value_name='c1', id_vars='index').set_index('index')
    
       c1  c2
    x  1   a
    x  3   b
    y  2   a
    y  4   b
    
  17. date wrangling

    • float epoch (unit: seconds) to pd.Timestamp

      pd.to_datetime(df[col], unit='s')
      
    • methods from pd.Timestamp

      WARNING

      Getting values from a pd.Timestamp column (df[col].values) results in np.datetime64, which does not carry timezone info

      # datetime.weekday()
      # 0 is monday
      # 6 is sunday
      df[timestamp_col].dt.weekday
      
      # get datetime.date object
      # datetime.date(2020, 6, 6)
      df[timestamp_col].dt.date
      
      # timezone conversion
      df[zone_aware_col] = pd.to_datetime(df[col], unit='s').dt.tz_localize('UTC')
      # before conversion, pd.Timestamp must have timezone assigned
      df[zone_aware_col].dt.tz_convert('Asia/Singapore')
      
  18. Problem solving

    Split into small frames

    answer_list = []
    
    def solve_problem_with_small_df(small_df):
      for index_label, row in small_df.iterrows():
        pass  # solve the problem row by row
      return small_df  # return the solved frame
    
    for index_label, small_df in df.groupby(key_col):
    
      solved_df = solve_problem_with_small_df(small_df)
      answer_list.append(solved_df)
    
    pd.concat(answer_list)
    

    example:

    df_golden.pivot_table(index='Field ID', values='Pages Gold', aggfunc=lambda x: ','.join(x)).reset_index()
    df_golden.groupby('Field ID')['Pages Gold'].apply(lambda x: ','.join(x)).reset_index()
    for idx, page_series in df_golden.groupby('Field ID')['Pages Gold']:
      l = ','.join(page_series)
    

    Vectorized method: count consecutive positive values

    a = [0,1,0,1,1,0,0,0,1,1,0,1,0]
    b = [1,0,0,0,1,1,0,1,1,1,0,0,0]
    
    df = pd.DataFrame(zip(a,b), columns=['a', 'b'])
    
    # (y != y.shift()).cumsum() labels each consecutive run of equal values;
    # cumcount() + 1 numbers positions within each run; multiplying by y zeroes the 0-entries
    c = lambda y: y * (y.groupby((y != y.shift()).cumsum()).cumcount() + 1)
    
    # concatenating a and b into one column does not affect the algorithm:
    # if the 1st item of b is 0, its result is guaranteed to be 0
    # if the 1st item of b is 1, it must start a new run, so its count is 1
    c(df.unstack()).unstack().T
    
  19. IO

    • to read a csv with a desired type: pd.read_csv('file_path', dtype=str)
    • to save a csv: df.to_csv('file_path', sep=',', index=False)
  20. Copy

    • pd.DataFrame(df): no, the new frame shares the underlying data
    • df.copy(deep=True): yes, a real deep copy
  21. Insert

    df.insert(loc, column_name, value)  # value may be a scalar, array, or series
    
  22. Aggregation

    df.groupby('col').agg(new_name=('col_name', 'agg_func_name'))
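
    For example, with hypothetical column names:

    df = pd.DataFrame({'col': ['a', 'a', 'b'], 'val': [1, 2, 3]})
    df.groupby('col').agg(total=('val', 'sum'), n=('val', 'count'))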
    
  23. Creation

    examples:

    pd.DataFrame([1, 2, 3, 4], columns=['a'])  # 4 rows, 1 column
    pd.DataFrame({'key': 'value', 'key2': 'value2'}.items())  # 2 rows, 2 columns
    pd.DataFrame({'column': [1,2,3,4]})  # 4 rows, 1 column
    

    assign special list to cell and expand cell to column

    df = pd.DataFrame([1, 2, 3, 4], columns=['a'])  # 4 rows, 1 column
    df['a'] = df['a'].astype('object')  # now a list can be assigned as a cell value
    df.at[0, 'a'] = [1, 2, 3, 4]  # assign a list as the value; loc leads to an error
    pd.DataFrame({'a': df.at[0, 'a']})  # expand the cell's list back into a column; same as the original df
    
  24. fillna

    # fill nan in column c with values from column b
    df['c'] = df['c'].fillna(df['b'])
    
  25. str

    df['string'].str.split(' ').str[-1]  # '123 456'.split(' ')[-1]
    