# Basics

A quick reference for syntax and usage. The focus is on system design, the fundamental design of the language itself, and algorithms.

# Loop

  1. range

    range(10) # 0 to 9
    reversed(range(10)) # 9 to 0
    for i in range(10):
        pass
    i == 9 # True
    
  2. zip

    Comparison between Python 2 and 3:

    Python 3: zip returns an iterator.
    Python 2: zip returns a list.

    zip([1,2,3,4]) # iterator of [(1,),(2,),(3,),(4,)]
    list(zip([1,2,3,4])) # [(1,),(2,),(3,),(4,)]
    
    zip([1,2,3], [4,5,6]) # [(1,4),(2,5),(3,6)]
    
    zip([1,2,3], [1,2,3,4,5,6,7]) # [(1,1), (2,2), (3,3)]
    
    itertools.zip_longest([1,2], [4,5,6], fillvalue=None) # [(1,4),(2,5),(None, 6)]
    

# String

  1. format

    1. %

      # number format
      "%.2f%%" % (100 * 1 / 3) # 33.33%, %% to escape
      # positive decimal means right alignment for both string and numbers
      "%6.2f" % (33.33) # "  33.33" right align
      "%-6.2f" % (33.33) # "33.33  " left align
      "%x" % (8217) # 2019 decimal to hex
      ("%%%ds" % length) % string # indention as parameter
      "%*s" % (length, string)  # indention as parameter
      
      # dict format
      to_print = {'name': 'david', 'age': 16}
      "%(name)s, %(age)d" % (to_print)
      
    2. format

      "{0}".format(0.33) # "0.33"
      "{0:.0%}".format(0.33) # "33%"
      "{0:.2f}".format(0.33) # "0.33"
      "{0:6.2f}".format(0.33) # "  0.33"
      "{0:>6.2f}".format(0.33) # "  0.33"
      "{0:<6.2f}".format(0.33) # "0.33  "
      "{:6.2f}".format(0.33) # "  0.33"
      "{:,.2f}".format(1234) # "1,234.00"
      
      # string default left, numeric default right
      "{:8s}".format("guagua") # "guagua  "
      "{:>8s}".format("guagua") # "  guagua"
      '{1:>{0}}'.format(length, string) # width as a parameter
      
      "{value}".format(value=10)
      
      person = {"name": "guagua", "age": 10}
      "{name}, {age}".format(**person)
      
      "I " +
      "love " +
      "you"
      
    3. f

      TIP

      f-strings are evaluated at runtime, and are faster than % and str.format
      \ is not allowed inside the {...} expression part

      name = "guagua"
      age = 10
      f"{name},{age:.2f}" # guagua,10.00
      # width 6 is exceeded by the default-precision output, so no visible padding
      f"{name},{age:6f}"  # guagua,10.000000
      # a plain width right-justifies numbers by default
      f"{name},{age:6.2f}"  # guagua, 10.00
      # > means right justified (default for numeric value)
      f"{name},{age:>6.2f}"  # guagua, 10.00
      # < means left justified
      f"{name},{age:<6.2f}"  # guagua,10.00
      # > means right justified (strings default to left)
      f"start:{name:>10s},{age:6.2f}"  # start:    guagua, 10.00
      # - is a sign option here, not left justification
      f"{name},{age:-6d}"  # guagua,    10
      f"{age:{length}}"  # pass width as a variable
      f"{name},{1234:,.2f}" # guagua,1,234.00
      
      f"{func(arg)}" # evaluation
      
      f"{{{age}}}" # {10}
      
      f"{ {age} }" # {10}
      
      class Test:
          def __init__(self):
              self.a = "111"
              pass
          def __str__(self):
              return f"{self.a}"
          def __repr__(self):
              return f"{self.a} hahah"
      test = Test()
      f"{test}" # 111
      f"{test!r}" # 111 hahah
      
      (f"I "
       f"love "
       f"you") # I love you, no + sign needed
      
       f"I " \
       f"love " \
       f"you"
      
  2. arabic

    The bidirectional (bidi) class of a character defines its display direction.

    TIP

    The default bidi base direction in Python and JS engines is left-to-right. In PyCharm it is content-based. VS Code's terminal is problematic: an Arabic string is treated as a normal string.

    left-to-right base direction:

    """
    1. arabic words display from right to left
    2. alphabets display from left to right
    3. base direction controls how words are connected
    4. in left-to-right base direction, alphabets are put at the
    existing text's right, even if the text is arabic
    5. arabic words[numbers] are put at the right if the existing text is non-arabic, otherwise at the left
    6. \u200e, the left-to-right mark, changes arabic[numbers] to be put at the right
    7. \u200f, the right-to-left mark, changes arabic[numbers] to be put at the left
    8. \u202e + text + \u202c => txet
    9. \u202e + arabic + \u202c => arabic
    """
    ar = 'للصناعة'
    
    # existing is none, 123 as it is, 123|cursor-->
    # existing is 123 with ltr, arabic word put at right
    # ar[0] = 'ل'
    # 123|cursor-->'للصناعة'
    '123' + ar => '123للصناعة'
    'abc' + ar => 'abcللصناعة'
    
    # ar[0] = 'ل'
    # ar:   <--cursor|'للصناعة'
    # existing is arabic, numbers[arabic] put at left
    # alphabets[punctuation] are always put at right
    # numbers: cursor-->|'للصناعة'
    # alphabets: 'للصناعة'|cursor-->
    ar + '123' => 'للصناعة123'
    ar + 'abc' => 'للصناعةabc'
    ar + '\n' => 'للصناعة\n'
    
    '123' + ar + '456' => '123للصناعة456'
    
    # \u200e changes numbers to be put at right
    # numbers: 'للصناعة'|cursor-->
    '123' + ar + '\u200E' + '456' => '123‎للصناعة‎456'
    
  3. Encoding

    # 'x': decimal to hex
    '%04x' % ord('我') => '6211'
    chr(25105) => '我'
    '我'.encode('utf-8') => b'\xe6\x88\x91'
    
    # \xef\xbb\xbf byte order mark to indicate utf-8 encoding schema
    '我'.encode('utf-8-sig') => b'\xef\xbb\xbf\xe6\x88\x91'
    """
    \xfe\xff or \xff\xfe is byte order mark to indicate endian and encoding schema
    if reads \xff\xfe, then python knows utf-16 is using little endian
    if reads \xfe\xff, then python knows utf-16 is using big endian
    in this case, utf-16 is using little endian
    \u6211 in little endian: \x11\x62;
    bytes \x62 in ascii is b
    """
    '我'.encode('utf-16') => b'\xff\xfe\x11b'
    '我'.encode('utf-16le') => b'\x11b'
    '我'.encode('utf-16be') => b'b\x11'
    
  4. Raw string

    raw strings are not 100% raw

    r'\'  # error
    r'\n' # \n
    r'\"'  # \"
    r'''123''''  # error
    r'''12'3'''  # 12'3
  5. Common method

    capitalize the first letter of each word: st.title()  # 'hello world' -> 'Hello World'

# Syntax sugar

  1. if else

    x = 10 if a > b else 9
    # ternary operator(nested conditional assignment)
    sign = "positive" if num > 0 else "negative" if num < 0 else "zero"
    
    lambda x: True if x % 2 == 0 else False
    
    [x for x in y if x>0]
    
    [x if x>10 else 5 for x in y]
    
  2. tuple

    l = 10,  # initialized as (10,)
    
  3. raw string

    l = r'\'  # SyntaxError: a raw string cannot end with a single backslash
    l = '\\'  # works
    l = r'\s'  # \s
    l = '\\s'  # \s
    
  4. & and |

    {1,2,3} & {3} => {3}
    {1,2,3} | {4} => {1,2,3,4}
    
    collections.Counter([1,2,3,3]) & collections.Counter([3,2,2]) => Counter({2: 1, 3: 1})
    collections.Counter([1,2,3,3]) | collections.Counter([3,2,2]) => Counter({1: 1, 2: 2, 3: 2})
    
  5. sort by two keys

    sorted(a_list, key=lambda x: (x.key1, x.key2))  # key must be passed as a keyword argument
    
  6. list concatenation

    l = []
    l[:0] = [1, 2, 3]  # [1, 2, 3]
    
  7. one liner

    stmt: simple_stmt | compound_stmt
    simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
    

    ; separates simple (non-compound) statements on a single line; the grammar makes no allowance for a simple statement and a compound statement separated by a semicolon.

    example with compound for statement:

    python -c $'x=10\nfor i in range(x): print(i)'  # use \n to separate non-compound and compound
    
  8. in

    def __contains__(self, needle):
        for elem in self:
            if needle is elem or needle == elem:
                return True
        return False
    

# Error handling

TIP

If the code in the except block raises an unhandled exception, the previously caught exception is re-raised along with it, in the format:

original_exception

During handling of the above exception, another exception occurred

new_exception

try:
    code()
except Exception as e:
    code_throw_exception()

# Regex

  1. re.sub

    re.sub(r"[^a-b]", ' ', "mystring")
    
    re.sub('\n', '', '123\n456')  # 123456, '\n' is a literal newline character
    re.sub(r'\n', '', '123\n456')  # 123456, the regex engine interprets the raw \n as a newline
    re.sub('\\n', '', '123\n456')  # 123456, same as the raw string above
    
    # returns '-a-b--d-'
    # Empty matches for the pattern are replaced when adjacent to a previous non-empty match
    # five matches due to * |a|bx|d|
    re.sub('x*', '-', 'abxd')
    
    re.sub('.*', '123', 'abc')  # abc| => 123123
    
  2. re.findall

    re.findall("dss", "gddss") # ["dss"], a list
    re.findall("dss", "gdssdss") # ["dss", "dss"], a list
    re.findall("d(ss)", "gddss") # ["ss"], a list
    re.findall("(12)|(34)", "123445") # [('12', '34')]
    re.findall("(12)|(34)", "123445") # [('12', ''), ('', '34')]
    
    re.search("d(ss)", "gddss") # a match group, where 0 is the full match(dss), group 1 is ss.
    re.search("dss", "gdssdss") # only has group 0(dss)
    
    re.match("dss", "gddss") # None, re.match("dss") <=> re.search("^dss") avoiding using re.match
    
  3. re.MULTILINE

    with re.MULTILINE, ^ in the pattern matches at the start of every line
    it has no effect on re.match, which only matches at the start of the whole string
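
    A minimal sketch of the flag's effect:

    import re

    text = 'first\nsecond'
    re.findall(r'^\w+', text)                # ['first']
    re.findall(r'^\w+', text, re.MULTILINE)  # ['first', 'second']
    re.match('second', text, re.MULTILINE)   # None: match still anchors at the start of the whole string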

  4. look

    Order

    • lookbehind runs after the main pattern: in (?<=abc)123, the engine first finds 123, then looks behind for abc

    Trivial pattern

    • something(?<=something) and (?=something)something are trivial:
      • 123(?<=123)abc and 123(?=abc)abc both match 123abc
      • 456(?<=123)abc matches nothing
      • re.search("^(?<=abc)123", s) or re.match("(?<=abc)123", s) always matches nothing
    • a pattern placed after a lookahead must match a prefix of the lookahead (or .); a pattern placed before a lookbehind must match the tail of the lookbehind

    Flexibility

    • (?=something) and (?!something) are flexible: something can be any regex pattern
    • lookahead supports alternation of any widths, but re's lookbehind must be fixed-width (jumping back a fixed width saves speed)
      • ^((?!123).)*$: from start to end, as long as there is no 123 inside. (negative lookahead has flexible ahead)

      • ^((?!123|456).)*$: from start to end, as long as there is no 123 nor 456 inside. (negative lookahead has flexible ahead)

      • (?:(?<=123)|(?<=456)) 123 or 456

      • (?<!123)(?<!456) not 123 and 456

  5. \b

    \b matches only at a position with a word character on one side and a non-word character (or string boundary) on the other.

    )\b does not match in ") ", as both sides of the position after ) are non-word

    a\b matches "a" at the end of a word
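
    A quick check of these boundary rules (a minimal sketch):

    import re

    re.search(r'a\b', 'a b')    # matches: word char 'a' on the left, space on the right
    re.search(r'\)\b', 'a) b')  # None: both sides of the position after ')' are non-word
    re.search(r'\)\b', 'a)b')   # matches: non-word ')' followed by word char 'b'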

  6. fuzzy match (from regex not re)

    Regex usually attempts an exact match, but sometimes an approximate, or “fuzzy”, match is needed, for those cases where the text being searched may contain errors in the form of inserted, deleted or substituted characters.

    A fuzzy regex specifies which types of errors are permitted, and, optionally, either the minimum and maximum or only the maximum permitted number of each type. (You cannot specify only a minimum.)

    The 3 types of error are:

    • Insertion, indicated by “i”
    • Deletion, indicated by “d”
    • Substitution, indicated by “s”

    In addition, “e” indicates any type of error.

    The fuzziness of a regex item is specified between “{” and “}” after the item.

    Examples:

    • foo match “foo” exactly
    • (?:foo){i} match “foo”, permitting insertions
    • (?:foo){d} match “foo”, permitting deletions
    • (?:foo){s} match “foo”, permitting substitutions
    • (?:foo){i,s} match “foo”, permitting insertions and substitutions
    • (?:foo){e} match “foo”, permitting errors

    If a certain type of error is specified, then any type not specified will not be permitted.

    In the following examples I’ll omit the item and write only the fuzziness:

    • {d<=3} permit at most 3 deletions, but no other types
    • {i<=1,s<=2} permit at most 1 insertion and at most 2 substitutions, but no deletions
    • {1<=e<=3} permit at least 1 and at most 3 errors
    • {i<=2,d<=2,e<=3} permit at most 2 insertions, at most 2 deletions, at most 3 errors in total, but no substitutions

    It’s also possible to state the costs of each type of error and the maximum permitted total cost.

    Examples:

    • {2i+2d+1s<=4} each insertion costs 2, each deletion costs 2, each substitution costs 1, the total cost must not exceed 4
    • {i<=1,d<=1,s<=1,2i+2d+1s<=4} at most 1 insertion, at most 1 deletion, at most 1 substitution; each insertion costs 2, each deletion costs 2, each substitution costs 1, the total cost must not exceed 4

    You can also use “<” instead of “<=” if you want an exclusive minimum or maximum.

    You can add a test to perform on a character that’s substituted or inserted.

    Examples:

    • {s<=2:[a-z]} at most 2 substitutions, which must be in the character set [a-z].
    • {s<=2,i<=3:\d} at most 2 substitutions, at most 3 insertions, which must be digits.

    By default, fuzzy matching searches for the first match that meets the given constraints. The ENHANCEMATCH flag will cause it to attempt to improve the fit (i.e. reduce the number of errors) of the match that it has found.

    The BESTMATCH flag will make it search for the best match instead.

    Further examples to note:

    • regex.search("(dog){e}", "cat and dog")[1] returns "cat" because that matches "dog" with 3 errors (an unlimited number of errors is permitted).
    • regex.search("(dog){e<=1}", "cat and dog")[1] returns " dog" (with a leading space) because that matches "dog" with 1 error, which is within the limit.
    • regex.search("(?e)(dog){e<=1}", "cat and dog")[1] returns "dog" (without a leading space) because the fuzzy search matches " dog" with 1 error, which is within the limit, and the (?e) then it attempts a better fit.

    In the first two examples there are perfect matches later in the string, but in neither case is it the first possible match.

    The match object has an attribute fuzzy_counts which gives the total number of substitutions, insertions and deletions.

    >>> # A 'raw' fuzzy match:
    >>> regex.fullmatch(r"(?:cats|cat){e<=1}", "cat").fuzzy_counts
    (0, 0, 1)
    >>> # 0 substitutions, 0 insertions, 1 deletion.
    
    >>> # A better match might be possible if the ENHANCEMATCH flag used:
    >>> regex.fullmatch(r"(?e)(?:cats|cat){e<=1}", "cat").fuzzy_counts
    (0, 0, 0)
    >>> # 0 substitutions, 0 insertions, 0 deletions.
    

    The match object also has an attribute fuzzy_changes which gives a tuple of the positions of the substitutions, insertions and deletions.

    >>> m = regex.search('(fuu){i<=2,d<=2,e<=5}', 'anaconda foo bar')
    >>> m
    <regex.Match object; span=(7, 10), match='a f', fuzzy_counts=(0, 2, 2)>
    >>> m.fuzzy_changes
    ([], [7, 8], [10, 11])
    

    What this means is that if the matched part of the string had been:

    'anacondfuuoo bar'
    

    it would’ve been an exact match.

    However, there were insertions at positions 7 and 8:

    'anaconda fuuoo bar'
            ^^
    

    and deletions at positions 10 and 11:

    'anaconda f~~oo bar'
               ^^
    

    So the actual string was:

    'anaconda foo bar'
    
  7. catastrophic backtracking

    re.search(r'^(\W*K\W*)+$', 'K  K  K  K  K  K  K  K 6')  # catastrophic backtracking: the 1st \W* and 2nd \W* create a huge number of combinations
    re.search(r'^(\W*K)+\W*$', 'K  K  K  K  K  K  K  K 6')  # fine: \W*K allows few combinations
    

# Arithmetic

  1. **, power
  2. val // num => floor
  3. -(-val // num) => ceil (see the sketch after this list)
  4. float arithmetic is inexact; use Decimal with string arguments for exact results:

    449027587.27 - 315000000 != 134027587.27  # floating point error
    from decimal import Decimal
    Decimal('449027587.27') - Decimal('315000000') == Decimal('134027587.27')  # True

  5. precedence: ** > ~x > +x, -x > *, /, //, % > +, - > <<, >> > & > ^ > | > ==, !=
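
    A quick check of the floor/ceil idioms above (a minimal sketch):

    7 // 2             # 3, floor division
    -(-7 // 2)         # 4, ceiling without importing math
    import math
    math.ceil(7 / 2)   # 4, same result but goes through float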

# Number bases

  1. hex <=> decimal
    hex(12) # 0xc
    format(12, 'x') # c (without the '0x' prefix)
    int("0xc", 16) # 12
    
  2. bin <=> decimal
    bin(12) # 0b1100
    format(12, 'b') # 1100
    int("0b1100", 2) # 12
    

# List

# Basics

l1 = [1,2,3]
l1.extend([4,5,6])
l1.append(7)
[8] + l1

l1[0::2]
l1[::-1]

a = [1,2,3]
b = [4,5,6]
c = [*a, *b] # [1,2,3,4,5,6]

mixed_types = [1,2,3, 'abc']

# pydash

  • pydash.key_by and pydash.group_by keep references to the original items
  • pydash.key_by and pydash.group_by evaluate a .-delimited iteratee string as a property path (a list iteratee with more than one item evaluates to True/False matching instead)
  • pydash.get accepts a path in both . and list format (see the sketch below)
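
A minimal sketch of these behaviors (the sample data is made up; assumes the pydash package is installed):

import pydash

items = [{'id': 'a', 'meta': {'n': 1}}, {'id': 'b', 'meta': {'n': 2}}]

by_id = pydash.key_by(items, 'id')
by_id['a'] is items[0]               # True: the reference to the original item is kept
by_id['a']['meta']['n'] = 99         # mutates the original item

pydash.get(items, '0.meta.n')        # 99, . delimited path
pydash.get(items, [0, 'meta', 'n'])  # 99, list path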

# Logging

  1. levels

    • CRITICAL
    • ERROR
    • WARNING
    • INFO
    • DEBUG
    • NOTSET
  2. mechanism

    import logging
    log = logging.getLogger(__name__)
    log.info("Hello, world")
    

    the message is turned into a LogRecord object and routed to the Handler objects registered for this logger. Each handler uses a Formatter to turn the LogRecord into a string and emits that string.

  3. lazy evaluation

    logger.debug('this is a debug message %s', var)  # lazy evaluation(only if debug level)
    
    # always evaluation
    logger.debug(f'this is a debug message {var}')
    logger.debug('this is a debug message %s' % var)
    
  4. usage

    basicConfig: initializes logging by adding a Formatter and a Handler to the root logger, if none exist (logging.info calls basicConfig internally if no handler is available)

    import logging
    import os
    
    # exports to stderr
    # ERROR:the.module.name:The log message
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
    

    OneLineExceptionFormatter

    import logging
    import os
    
    class OneLineExceptionFormatter(logging.Formatter):
        def formatException(self, exc_info):
            result = super().formatException(exc_info)
            return repr(result)
    
        def format(self, record):
            result = super().format(record)
            if record.exc_text:
                result = result.replace("\n", "")
            return result
    
    # default level for handler is NOTSET
    handler = logging.StreamHandler()
    # BASIC_FORMAT = "%(levelname)s:%(name)s:%(message)s"
    formatter = OneLineExceptionFormatter(logging.BASIC_FORMAT)
    handler.setFormatter(formatter)
    # default level for logger is NOTSET
    # no name given, returns root logger
    root = logging.getLogger()
    root.setLevel(os.environ.get("LOGLEVEL", "INFO"))
    root.addHandler(handler)
    
    # global try/catch
    try:
        exit(main())
    except Exception:
        # logging.exception(msg) == logging.error(msg, exc_info=True)
        # exc_info=True attaches the stack trace to the log record
        # if there is no other exception handler, outputting the
        # stack trace is okay; otherwise, log the error with a
        # summary only, i.e., logging.error(msg, exc_info=False)
        logging.exception("Exception in main(): ")
        exit(1)
    

    ColoredLogger

    import logging
    
    class CustomFormatter(logging.Formatter):
        """Logging Formatter to add colors and count warning / errors"""
    
        grey = "\x1b[38;21m"
        yellow = "\x1b[33;21m"
        red = "\x1b[31;21m"
        bold_red = "\x1b[31;1m"
        reset = "\x1b[0m"
        format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)"
    
        FORMATS = {
            logging.DEBUG: grey + format + reset,
            logging.INFO: grey + format + reset,
            logging.WARNING: yellow + format + reset,
            logging.ERROR: red + format + reset,
            logging.CRITICAL: bold_red + format + reset
        }
    
        def format(self, record):
            log_fmt = self.FORMATS.get(record.levelno)
            formatter = logging.Formatter(log_fmt)
            return formatter.format(record)
    
    # default level for handler is NOTSET
    handler = logging.StreamHandler()
    formatter = CustomFormatter()
    handler.setFormatter(formatter)
    # default level for logger is NOTSET
    # no name given, returns root logger
    root = logging.getLogger()
    root.setLevel(os.environ.get("LOGLEVEL", "INFO"))
    root.addHandler(handler)
    
    # global try/catch
    try:
        exit(main())
    except Exception:
        # logging.exception == logging.error('', exc_info=True)
        # exc_info contains the stack trace
        logging.exception("Exception in main(): ")
        exit(1)
    

    Log to file

    import logging
    import logging.handlers
    import os
    
    handler = logging.handlers.WatchedFileHandler(
        os.environ.get("LOGFILE", "/var/log/yourapp.log"))
    formatter = logging.Formatter(logging.BASIC_FORMAT)
    handler.setFormatter(formatter)
    root = logging.getLogger()
    root.setLevel(os.environ.get("LOGLEVEL", "INFO"))
    root.addHandler(handler)
    
     # global try/catch
    try:
        exit(main())
    except Exception:
        logging.exception("Exception in main()")
        exit(1)
    

    load the logging configuration from a configuration file

    version: 1
    disable_existing_loggers: true
    formatters:
        simple:
            format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    handlers:
        console:
            class: logging.StreamHandler
            level: INFO
            formatter: simple
            stream: ext://sys.stdout
        file:
            class: logging.FileHandler
            level: DEBUG
            filename: logs/dbInteract.log
    loggers:
        # They override the existing ones
        # if disable_existing_loggers, the non-defined loggers won't work
        simpleExample:
            level: DEBUG
            handlers: [console]
            propagate: no
        root:
            level: DEBUG # the logger level must be at least as verbose as its handlers' levels
            handlers: [console,file]
    
    import logging.config
    import yaml
    
    with open('./test.yml', 'r') as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)
    
    logging.config.dictConfig(config)
    
  5. remarks

    • When deploying to containers, try to keep things as simple as possible. Log to standard out/err and rely on your container host or orchestration platform to handle figuring out what to do with the logs
  • when exporting to stderr, PyCharm marks the messages in red
    • gunicorn adds additional handlers whereas tensorflow will not if a root logger already exists
  • set up the logging system after the necessary packages are imported, to take control of their loggers
    • to disable a specific package: logging.getLogger('name').disabled = True
    • logging.info uses root logger
    • logging.getLogger() gets root logger

# import

Background

Module execution via import statement (i.e., import <modulename>):

  • sys.path is not modified in any way
  • __name__ is set to the absolute form of <modulename>
  • __package__ is set to the immediate parent package in <modulename>
  • __init__.py is evaluated for all packages (including its own for package modules)
  • __main__.py is not evaluated for package modules; the code is evaluated for code modules

Module execution via command line with filename (i.e., python <filename>):

  • sys.path is modified to include the directory containing <filename>
  • __name__ is set to __main__
  • __package__ is set to None
  • __init__.py is not evaluated for any package (including its own for package modules)
  • __main__.py is evaluated for package modules; the code is evaluated for code modules

Module execution via command line with modulename (i.e., python -m <modulename>):

  • sys.path is modified to include the current directory
  • __name__ is set to __main__
  • __package__ is set to the immediate parent package in <modulename>
  • __init__.py is evaluated for all packages (including its own for package modules)
  • __main__.py is evaluated for package modules; the code is evaluated for code modules
  1. import steps

    """
    project structure
    /usr/test(root):
    - app.py             # __package__ is ''
    - another_app.py     # __package__ is ''
    - moduleA (package):
        - class_a.py     # __package__ is moduleA
        - class_b.py     # __package__ is moduleA
    """
    
    1. resolve relative imports: if there is from .xx import yy or from ..xx import yy etc., __package__ is used to resolve them. __package__ might be None, an empty string, or the package name. Different modules (files) can have different __package__ values in the same Python run. Modules in the root folder have the empty string as __package__; other modules have the package they reside in as __package__. Note that if the program is run as a script (without -m), __package__ becomes None, so relative imports will not work.
      """
      app.py
      """
      print(f'__package__ is {__package__}')
      
      """
      another_app.py
      """
      import app
      
      """
      class_a.py
      """
      from .. import app
      
      # case 1
      # If a module(file) is at root, or top-level, __package__ is empty string
      (/usr/test) $ python -m another_app
      >>> __package__ is ''
      
      # case 2
      # when a top-level module is run as the entry point as with -m, __package__ is empty string
      (/usr/test) $ python -m app
      >>> __package__ is ''
      
      # case 3
      # number of dots cannot exceed number of packages in __package__
      # class_a has 1 level package: moduleA, so .. does not work
      (/usr/test) $ python -m moduleA.class_a
      >>> ValueError: attempted relative import beyond top-level package
      # to make it work, switch the root to /usr so that __package__ for class_a.py becomes test.moduleA
      (/usr) $ python -m test.moduleA.class_a
      >>> __package__ is test.moduleA
      
      # case 4
      # if run the program as scripts, i.e., without -m, __package__ is None
      (/usr/test) $ python app.py
      >>> __package__ is None
      
    2. search from sys.path
      """
      class_a.py
      """
      import sys
      print(f'sys.path is {sys.path[0]}')
      
      # the directory containing the script, i.e., /usr/test/moduleA
      (/usr/test) $ python moduleA/class_a.py
      >>> sys.path is /usr/test/moduleA
      
      # current directory, i.e., /usr/test
      (/usr/test) $ python -m moduleA.class_a
      >>> sys.path is /usr/test
      
      # interactive shell
      (/usr/test) $ python
      ~~~ import sys
      ~~~ sys.path[0]
      >>> ''
      
  2. __init__.py

    WARNING

    Since python3.3, an empty __init__.py is no longer needed to mark a folder as python package

    # moduleA's __init__.py is run
    from app.moduleA import method
    import app.moduleA
    
    # moduleA's __init__.py is run here too: importing a submodule
    # always initializes its parent packages first
    import app.moduleA.method
    
  3. Reloading

    After code changes, a running process still executes the old code, but the source lines shown in a stack trace are read from the updated files

    import importlib
    # only module can be reloaded
    importlib.reload(com.package.module)
    
    from com.package.module import method
    
  4. Variable control

    the __all__ variable controls what can be imported by from x import * (see the sketch below)
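
    A minimal sketch (the module name mymod is hypothetical):

    # mymod.py
    __all__ = ['public_fn']

    def public_fn():
        return 'exported'

    def helper_fn():
        return 'not exported'

    # client code
    from mymod import *
    public_fn()   # works
    helper_fn()   # NameError: filtered out by __all__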

  5. Miscellaneous

    from A.B.C import D
    from B.C import D  # suppose A is added to PYTHONPATH
    
    """
    Modifying variables inside the 1st D does not change those inside the 2nd D: the two imports create two distinct module objects, since their __package__ values differ
    """
    
    import A
    A.B  # error as sub module cannot be accessed if not imported
    import A.B  # okay
    

# OOP

  1. is: checks whether two variables refer to the same object

  2. self has access to class variables, but effectively read-only: assigning through self creates an instance attribute that shadows the class variable (see the sketch below)

    • it is recommended to access class methods through self for simplicity
    • it is recommended to access class variables through type(self) instead of self.__class__

    Comparison of `type(self)` with `self.__class__`

    type() is the same as __class__ in Python 3
    in Python 2, only __class__ works for old-style classes (those without inheritance from object)
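
    A minimal sketch of the shadowing behavior (the class name is made up):

    class Counter:
        total = 0  # class variable

        def bump(self):
            # updates the class variable; self.total += 1 would instead
            # create an instance attribute that shadows the class variable
            type(self).total += 1

    c = Counter()
    c.bump()
    Counter.total  # 1
    c.total        # 1, reading through self still finds the class variable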

  3. child has access to parent's overridden method

    class Foo(Bar):
        def __init__(self, arg):
            super(Foo, self).__init__(arg)
        def baz(self, arg):
            # method implementation is from parent, but self refers to child
            return super(Foo, self).baz(arg)
    
  4. child has access to grandparent's overridden method

    class Bar(Cha):
        def __init__(self, arg):
            super(Bar, self).__init__(arg)
    class Foo(Bar):
        def __init__(self, arg):
            super(Foo, self).__init__(arg)
        def chz(self, arg):
            # method implementation is from grandparent, but self refers to child
            # note the 1st argument of super is Bar instead of Foo
            return super(Bar, self).chz(arg)
    
  5. type

    type of a class is type if no custom metaclass is given

    class PythonBlog:
        pass
    
    class PythonBlogSon(PythonBlog):
        pass
    
    type(PythonBlog)  # type
    type(PythonBlogSon)  # type
    
    class PythonBlog(metaclass=MyMeta):
        pass
    
    class PythonBlogSon(PythonBlog):
        pass
    
    type(PythonBlog)  # MyMeta
    type(PythonBlogSon)  # MyMeta
    
  6. __new__

    rarely used

    class HovercraftFull(Exception):
        pass
    
    class Eel(object):
        MAX_EELS = 20
        n_eels = 0
    
        # memory allocation and static variable manipulation
        # called to create an instance
        def __new__(cls, *args, **kwargs):
            if cls.n_eels == cls.MAX_EELS:
                raise HovercraftFull()
            obj = super(Eel, cls).__new__(cls)
            cls.n_eels += 1
            return obj
    
  7. __call__

    class Foo:
        def __init__(self, a, b, c):
            pass
    
    class Goo:
        def __call__(self, a, b, c):
            pass
    
    Foo(1, 2, 3)  # __init__ is called
    
    goo = Goo()
    goo(1, 2, 3)  # __call__ is called
    
    
    class Singleton(type):
        _instances = {}
        def __call__(cls, *args, **kwargs):
            if cls not in cls._instances:
                cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
            return cls._instances[cls]
    
    class MyClass(metaclass=Singleton):
        pass
    
  8. __dict__

    stores class attributes for class and instance attributes for instance

    class Test:
        def __init__(self):
            self.a = 10
    
        def execute(self):
            return 10
    
    
    """
    mappingproxy({'__module__': '__main__',
                '__init__': <function __main__.Test.__init__(self)>,
                'execute': <function __main__.Test.execute(self)>,
                '__dict__': <attribute '__dict__' of 'Test' objects>,
                '__weakref__': <attribute '__weakref__' of 'Test' objects>,
                '__doc__': None})
    """
    Test.__dict__
    
    Test().__dict__  # {'a': 10}
    
  9. __wrapped__

    get access to wrapped underlying function

    import functools
    
    class Memoize:
        def __init__(self, func):
            self.func = func
            self.memo = dict()
            # update __module__, __name__, __qualname__, __annotations__ and __doc__
            # no update on __dict__
            functools.update_wrapper(self, func, updated=[])
    
        def __call__(self, *args):
            if args not in self.memo:
                self.memo[args] = self.func(*args)
            else:
                print("cls decorator. You have printed this before")
            return self.memo[args]
    
    
    class CallCounter:
        def __init__(self, func):
            self.func = func
            self.calls = 0
            # update __module__, __name__, __qualname__, __annotations__ and __doc__
            # update __dict__, i.e., {**self.__dict__, **wrapped.__dict__}
            functools.update_wrapper(self, func)
    
        def __call__(self, *args, **kwargs):
            self.calls += 1
            return self.func(*args, **kwargs)
    
    @Memoize
    @CallCounter
    def doubleprint(x):
        for elem in x:
            print(elem + " " + elem)
    
    doubleprint  # Memoize at 0x7fa8034edcc0
    doubleprint.__wrapped__  # CallCounter at 0x7fa8019540b8
    doubleprint.__wrapped__.__wrapped__  # function itself
    
  10. __eq__ and __hash__

    class Number:
    
        def __init__(self, number):
            self.number = number
    
        def __eq__(self, other):
            """Overrides the default implementation"""
            if isinstance(other, Number):
                return self.number == other.number
    
            # swap to other's __eq__
            return NotImplemented
    
        # must return int
        def __hash__(self):
            """Overrides the default implementation"""
            return hash(tuple(sorted(self.__dict__.items())))
    
    n1 = Number(2)
    o1 = Other('other')  # Other is some other class
    
    # first tries n1's then o1's __eq__, if Other is not a subclass of Number
    # first tries o1's then n1's __eq__, if Other is a subclass of Number
    n1 == o1
    
  11. staticmethod is inherited

    class A:
        @staticmethod
        def a():
            print('A')
    
    class B(A):
        @staticmethod
        def b():
            print('b')
    
    B.a()  # A
    
  12. method resolution order

    definition

    class Child(Parent):  # determines method resolution order
        def method(self, arg):
            """
            child methods (including __init__ and others) follow the method resolution order (mro). Note that the mro chain can be either disconnected or connected:
            1. Disconnected mro: if a class has no super().method(arg), resolution stops as soon as that class's method call finishes
            2. Connected mro: if every class except object calls super().method(arg), the methods of every generation are called
            """
            # super().method(arg) equivalence
            mro = type(self).mro()
            for next_class in mro[mro.index(Child) + 1:]: # slice to end
                if hasattr(next_class, 'method'):
                    next_class.method(self)
                    break
    

    mro examples

    #  mro example I
    class Parent(object):
        pass
    
    class ChildA(Parent):
        pass
    
    class ChildB(Parent):
        pass
    
    class Grandchild(ChildA, ChildB):
        pass
    
    Grandchild.__mro__  # mro: (Grandchild, ChildA, ChildB, Parent, object)
    
    # mro example II
    class Parent:
        def __init__(self, x):
            self.x = x
            print("initializing Parent")
    
    class ChildA(Parent):
        pass
    
    """
    if a class in mro misses the method, that class is skipped
    """
    a = ChildA(10)  # skips ChildA, and Parent's __init__ is called
    a.x  # 10
    
    # mro example III
    class Parent(object):
        def __init__(self, x):
            self.x = x
            print('initializing Parent')
    
        def gg(self):
            print('Parent!')
    
    class ChildA(Parent):
        def __init__(self):
            print('initializing ChildA')
    
    """
    mro for both __init__ and gg are both [ChildA, Parent]
    
    execution order of __init__: ChildA starts, ChildA ends
    execution order of gg: ChildA skips (no gg), Parent starts, Parent ends
    """
    a = ChildA()
    a.gg()  # works
    a.x  # error
    
    # mro example IV
    class Parent(object):
        def __init__(self):
            print('initializing Parent')
    
    class ChildA(Parent):
        def __init__(self):
            print('initializing ChildA')
            super().__init__()
    
    class ChildB(Parent):
        def __init__(self):
            print('initializing ChildB')
            super().__init__()
    
    class Grandchild(ChildA, ChildB):
        pass
    
    """
    connected mro in __init__ method
    
    mro is G, A, B, Parent
    
    execution order:
    G skips (as there is no __init__ method)
    A starts
    B starts
    Parent starts
    Parent ends
    B ends
    A ends
    
    execution result:
    initializing ChildA
    initializing ChildB
    initializing Parent
    """
    Grandchild()
    
    # mro example V
    class Parent(object):
        def __init__(self):
            print('initializing Parent')
    
    class ChildA(Parent):
        def __init__(self):
            print('initializing ChildA')
            # Parent.__init__(self) exists and miss super().__init__(), mro of __init__ becomes disconnected
            Parent.__init__(self)
    
    class ChildB(Parent):
        def __init__(self):
            print('initializing ChildB')
            super().__init__()
    
    class Grandchild(ChildA, ChildB):
        pass
    
    """
    disconnected mro in __init__ method
    
    mro is G, A, B, Parent
    
    execution order:
    G skips (as there is no __init__ method)
    A starts
    Parent starts
    Parent ends
    A ends
    
    execution result:
    initializing ChildA
    initializing Parent
    """
    Grandchild()
    
    # mro example VI
    class Parent(object):
        def gg(self):
            print('Parent!')
    
    class ChildA(Parent):
        pass
    
    class ChildB(Parent):
        def gg(self):
            print('Child B!')
    
    class Grandchild(ChildA, ChildB):
        pass
    
    """
    disconnected mro in gg method
    
    mro is G, A, B, Parent
    
    execution order:
    G skips (as there is no gg method)
    A skips (as there is no gg method)
    B starts
    B ends
    
    execution result:
    Child B!
    """
    g = Grandchild()
    g.gg()
    
  13. Enum

    An enumeration is a set of symbolic names (members) bound to unique, constant values

    from enum import Enum
    
    class TestEnum(Enum):
        a = 0
        b = 1
    
    class Test:
        a = 0
        b = 1
    
    # Enum is iterable
    list(TestEnum)  # [<TestEnum.a: 0>, <TestEnum.b: 1>]
    list(Test)  # error
    
    # Enum has value and name
    TestEnum.a  # <TestEnum.a: 0>
    Test.a  # 0
    TestEnum.a.value  # 0
    Test.a.value  # error
    TestEnum.a.name  # a
    Test.a.name  # error
    
    # Enum is unassign-able
    TestEnum.a = 10  # error
    Test.a = 10
    

    Enum values can be functions

    import functools
    from enum import Enum
    
    class Wrapper:
        def __init__(self, f):
            self.f = f
            # update __module__, __name__, __qualname__, __annotations__ and __doc__
            # update __dict__
            functools.update_wrapper(self, f)
    
        def __call__(self, *args, **kwargs):
            return self.f(*args, **kwargs)
    
        def __repr__(self):
            return self.f.__repr__()
    
    def fa():
        return 'A'
    
    class TestEnum(Enum):
        # make a as an attribute instead of method definition
        a = Wrapper(fa)
    
        def __call__(self, *args, **kwargs):
            return self.value(*args, **kwargs)
    
    TestEnum.a()  # 'A'
    

    Enum values can be class

    from enum import Enum
    
    class CA():
        def __init__(self, name):
            self.name = name
    
        def execute(self):
            return 'A'
    
    class TestEnum(Enum):
        a  = CA
        def __call__(self, *args, **kwargs):
            return self.value(*args, **kwargs)
    
    TestEnum.a('ca').execute()  # 'A'
    

# Pickle

# general

Pickle of spacy.tokenizer.Tokenizer

After loading, Tokenizer will call token_match method, hence there should not be any dependency inside token_match on variables which are restored later than Tokenizer

dump: might call __getstate__ or __reduce__

load: might call __new__ or __setstate__

Picklable objects are guaranteed to have the same value after un-pickling, but might produce a dump value different from the original dump: link
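
A minimal sketch of the dump/load hooks (the Conn class and its unpicklable field are made up):

import pickle

class Conn:
    def __init__(self, host):
        self.host = host
        self.sock = open('/dev/null')  # stand-in for an unpicklable resource

    def __getstate__(self):
        # called on dump: drop the unpicklable field
        state = self.__dict__.copy()
        del state['sock']
        return state

    def __setstate__(self, state):
        # called on load: restore the dict and rebuild the resource
        self.__dict__.update(state)
        self.sock = open('/dev/null')

c2 = pickle.loads(pickle.dumps(Conn('db')))
c2.host  # 'db'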

# Cannot pickle

import functools
import pickle

def decorate(f):
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        print(pickle.dumps(f))
        return f(*args, **kwargs)
    return wrapper

@decorate
def main():
    print('A')

if __name__ == '__main__':
    """
    Pickle serializes only metadata about the function and then restores it from its qualified name. But at this point the decorator's wrapper has already replaced the main function, so the pickled function does not match the function it tries to restore

    Traceback (most recent call last):
    ...
    _pickle.PicklingError: Can't pickle <function main at 0x7ff1df907ee0>: it's not the same object as __main__.main
    """
    main()

# exec vs eval

exec("", scope): always return None. print or import takes side effect

eval: accepts expression only. assignment or statement(like if) are rejected
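
A minimal sketch:

scope = {}
exec("import math\nx = math.pi", scope)  # statements allowed; returns None
scope['x']                               # 3.141592653589793

eval("1 + 2")   # expression => 3
eval("x = 1")   # SyntaxError: assignment is a statement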

# Closure

function with environment

Scope

nonlocal and global are keywords that control variable scope. Note that a variable created in the main body of the Python code is a global variable and belongs to the global scope.

def generate_counter2():
    x = 0
    def add_one():
        # By making x non-local, Python finds it in the parent context and creates a closure for it again.
        nonlocal x
        x = x + 1
        return x
    return add_one

# closure = generate_counter2()
# closure.__closure__[0].cell_contents => 0
# closure()
# closure.__closure__[0].cell_contents => 1

# Yield

def gen():
    with some_fn() as manager:
        yield something

g = gen()
next(g)  # manager has not exited yet
next(g)  # raises StopIteration after leaving the with block; manager has exited

def flatten(sequence):
    """flatten a multi level list or something
    >>> list(flatten([1, [2], 3]))
    [1, 2, 3]
    >>> list(flatten([1, [2], [3, [4]]]))
    [1, 2, 3, 4]
    """
    for element in sequence:
        if hasattr(element, '__iter__'):
            yield from flatten(element)
        else:
            yield element


def writer():
    """A coroutine that writes data *sent* to it to fd, socket, etc."""
    while True:
        try:
            w = (yield)
            print('>> ', w)
        except StopIteration:
            print('Close generator!')
def writer_wrapper(sub_generator):
    # it establishes a transparent bidirectional connection between the caller and the sub-generator
    yield from sub_generator
w = writer()
next(w)
w.send('data1')
w.send('data2')
w.throw(StopIteration)

w = writer()
w_wrapper = writer_wrapper(w)
next(w_wrapper)
w_wrapper.send('data1')
w_wrapper.send('data2')
w_wrapper.throw(StopIteration)

# Coroutine and Future

Coroutine

There are two types of coroutines: native coroutines and legacy generator-based coroutines. The legacy coroutine was removed in Python 3.11.

Native coroutines use the async/await syntax. A native coroutine is executed only when it is awaited.

Generator-based coroutines are the old (Python 3.5 era) way to create coroutines, using the @asyncio.coroutine / yield from syntax. @asyncio.coroutine enables the generator to use yield from to call native coroutines, and also enables the generator to be called by native coroutines, for instance in an await expression. The decorator @types.coroutine does the same thing, with a small difference.

import asyncio

async def b_sleep():
    return 100

@asyncio.coroutine
def a_sleep():
    print("doing something in async")
    yield from b_sleep()  # yield from native coroutine or generator based coroutine

@asyncio.coroutine
def a_sleep():
    print("doing something in async")
    yield  # yield None

await a_sleep()

asyncio can also wrap coroutines into Tasks (Futures), which the event loop schedules to run regardless of await.

# Running order

If control is never yielded to the event loop, the running order is the same as the await order. Once control is yielded to the event loop, scheduled Tasks can be executed before they are awaited.

To yield control to event loop:

await asyncio.sleep(1)

# yield None
@types.coroutine
def __sleep0():
    yield
await __sleep0()

# yield Future
await future

More details on yielding control: https://github.com/python/cpython/blob/febf54bcf3fdc45ad84b4073e24bbaaee0ac8b2a/Lib/asyncio/tasks.py#L255

import asyncio
import time

async def p(word):
    print(f'{time.time()} - {word}')


async def main1():
    loop = asyncio.get_event_loop()
    coro = p('await')
    task2 = loop.create_task(p('create_task2'))  # scheduled to next iteration
    await coro  # run coro only await
    await task2 # wait for final result

async def main2():
    loop = asyncio.get_event_loop()
    coro = p('await')
    task2 = loop.create_task(p('create_task2'))  # scheduled to next iteration
    task3 = loop.create_task(p('create_task3'))  # scheduled to next iteration
    await asyncio.sleep(1)  # loop got control, and runs task2 and task 3
    await coro  # run coro only await
    await task2 # wait for final result
    await task3 # wait for final result


# await > task2
await main1()

# task2 > task3 > await
await main2()

# Awaitable Objects

An awaitable object generally implements an __await__() method. Coroutine objects returned from async def functions are awaitable. The generator iterator objects returned from generators decorated with types.coroutine() or asyncio.coroutine() are also awaitable, but they do not implement __await__().

class MyObject:
    def __await__(self):
        # must return an iterator; using yield makes __await__ a generator
        yield from a_future  # a_future is some awaitable Future

# Garbage collection

When an object is no longer referenced by any variable or other object, or becomes unreachable, it becomes garbage. Python makes the unused memory available for reuse, but when it is returned to the OS depends on the allocator.

To check memory allocation:

import numpy as np
np.array([1,2,3]).nbytes  # check memory allocation for a numpy array


## check memory allocation for any object
import gc
import sys

def get_obj_size(obj):
    marked = {id(obj)}
    obj_q = [obj]
    sz = 0

    while obj_q:
        sz += sum(map(sys.getsizeof, obj_q))

        # Lookup all the object referred to by the object in obj_q.
        # See: https://docs.python.org/3.7/library/gc.html#gc.get_referents
        all_refr = ((id(o), o) for o in gc.get_referents(*obj_q))

        # Filter object that are already marked.
        # Using dict notation will prevent repeated objects.
        new_refr = {o_id: o for o_id, o in all_refr if o_id not in marked and not isinstance(o, type)}

        # The new obj_q will be the ones that were not marked,
        # and we will update marked with their ids so we will
        # not traverse them again.
        obj_q = new_refr.values()
        marked.update(new_refr.keys())

    return sz

To clear memory for a dict:

a = {'a': [1, 2, 3], 'b': [4, 5, 6]}

del a['a'] # removes the key binding; a is now {'b': [4, 5, 6]}
a.pop('a') # x = a['a']; del a['a']; return x
a.clear() # removes all items in place; a is now {}

To check number of references:

a = {'a': [1, 2, 3], 'b': [4, 5, 6]}

import gc
gc.get_referrers(a)

import sys
sys.getrefcount(a)  # 2, as number of references plus 1 when `a` is passed to this function

# Subprocess

A child process becomes a zombie when it has exited but its parent has not read its exit status (e.g., via wait). Be careful to avoid creating zombie processes:

import subprocess
from subprocess import Popen, PIPE, TimeoutExpired

# 1. use call
subprocess.call(['grep', 'jdoe', '/etc/passwd'])  # wait for command to complete

# 2. use communicate
process = Popen(['ls', '-l', '/tmp'], stdout=PIPE, stderr=PIPE)
stdout, stderr = process.communicate()  # read data from stdout and stderr, until end-of-file is reached.

try:
    stdout, stderr = process.communicate(timeout=15)  # raises TimeoutExpired after 15s, but does not kill the child
except TimeoutExpired:
    process.kill()
    outs, errs = process.communicate()

# 3. use wait
process = subprocess.Popen(('ls', '-l', '/tmp'), stdout=subprocess.PIPE)
process.wait()

# 4. if no wait, make sure you don't keep references to the Popen objects.
# If python script exits, subprocess treats pid 1 as its parent
Popen(["sleep", "30"])

# Concurrent & Parallel

  1. multiprocessing is designed for parallelism

                     Multi-args   Concurrence   Blocking   Ordered-results
    apply            X                          X
    apply_async      X            X
    map                           X             X          X
    map_async                     X                        X
    starmap          X            X             X          X
    starmap_async    X            X                        X
    • apply(func, args, kwds): lets the func run in child process but blocks the main process; same as apply_async().get();
    • apply_async(func, args, kwds, callback, error_callback): schedules the func and returns an async result object; the completion order of multiple tasks is indeterminate
    # this function is called in main process
    def a_back(progress, result, saver):
        progress.update()
        log.info('sth')
        process_result(result)
    
    # this function is called in main process
    def b_back(progress, exception, saver):
        progress.update()
        log.info('sth')
        process_exception(exception)
    
    # exit: pool.terminate
    with mp.Pool(processes=4) as pool:
        with tqdm(total=some_count) as progress:
            jobs = []
            saver = {}
            for i in a_iterator:
                job = pool.apply_async(func, args=(i, ), kwds=a_dict, callback=lambda r: a_back(progress, r, saver), error_callback=lambda e: b_back(progress, e, saver))
                jobs.append(job)
    
    
            # to handle exception thrown in child processes
            for j in jobs:
                try:
                    j.get()  # blocks main process, but other children are running in background
                except Exception as e:
                    # exception is still thrown even though it might be consumed once by error_callback
                    handle()
    
            # alternatively, if no interest in exceptions
            # pool.close()
            # pool.join()
    
    process_saver(saver)
    
    • map(func, [1,2,3,4]): chops the iterable into chunks which it submits to the process pool as separate tasks; blocks the main process; order is guaranteed; map(func, [(1,2), (3,4)]) does not spread the tuples as arguments, use starmap instead (see the sketch after this list)
    • map_async(func, [1,2], callback, error_callback): order is guaranteed
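
    A minimal sketch of spreading tuple arguments with starmap:

    import multiprocessing as mp

    def add(a, b):
        return a + b

    if __name__ == '__main__':
        with mp.Pool(2) as pool:
            pool.map(len, [[1, 2], [3, 4]])      # single-arg func: [2, 2]
            pool.starmap(add, [(1, 2), (3, 4)])  # tuples spread as args: [3, 7]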
  2. fork vs spawn

    spawn is fork + execve: Ref
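
    A minimal sketch of the practical difference: fork inherits the parent's memory, while spawn starts a fresh interpreter that re-imports this module (hence the __main__ guard):

    import multiprocessing as mp

    def child():
        print('hello from child')

    if __name__ == '__main__':
        # fork is the default on Linux; spawn on macOS and Windows
        ctx = mp.get_context('spawn')
        p = ctx.Process(target=child)
        p.start()
        p.join()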

# Time

# Get

from datetime import datetime
import pytz

# gets current time at local time zone
datetime.now()

# gets current time at another time zone
datetime.now(pytz.timezone('America/Chicago'))

# Create

# creates a time with time zone, e.g., 2015-06-11
my_datetime = datetime(2015, 6, 11, 13, 30)
my_tz = pytz.timezone('America/Chicago')
good_dt = my_tz.localize(my_datetime) # replace(tz=my_tz) gives wrong result: https://stackoverflow.com/a/50613134/6845273

# Switch

from dateutil import parser
from dateutil import tz
# switch utc to GMT+08:00
raw_time_str = '2021-11-03T09:53:10.683Z'
from_zone = tz.gettz('UTC')
to_zone = tz.gettz('Asia/Shanghai')  # IANA name, same as in pytz
utc = parser.parse(raw_time_str)
utc = utc.replace(tzinfo=from_zone)
utc.astimezone(to_zone)

# Parser

import dateparser

# parse with custom format, custom locale, beginning year
dateparser.parse(time_string, date_formats=['%Y-%m-%d'], settings={'DATE_ORDER': 'DMY', 'RELATIVE_BASE': datetime(2001, 1, 1),'PARSERS': ['custom-formats']}, locales=['en'])

from dateutil import parser
parser.parse(time_string)
query                 dateutil   dateparser
2016/09/01            1          1
2016-09-01            1          1
09/01/2016            1          1
09-01-2016            1          1
09012016              0          0
09/01/2016 15:20      1          1
09/01/2016 at 15h20   1          1
15 min ago            0          1
two hours ago         0          1
a day ago             0          1
tuesday               0          1
monday at noon        0          1
-------------------   --------   ----------
total (12)            6          11

# Operation

# subtract
from datetime import datetime, timedelta
import pytz
d = datetime.today() - timedelta(days=days_to_subtract)

# create from a 10-digit unix timestamp
date = datetime.fromtimestamp(ten_digit_ts, pytz.timezone('Asia/Shanghai'))

# IDE

# Pycharm

  1. dependency

    adds local folder to library root [interpreter settings] => adds to interpreter path => adds to sys.path

    python setup.py => adds a soft link in site-packages => sys.path contains site-packages

    adds a folder as source root => adds to PYTHONPATH => adds to sys.path

    if PATH is edited in run config, its value would automatically be new_value:$PATH

  2. profiler

    own time: time spent without children

  3. locale

    LC_CTYPE is empty by default in the runner, and C in the PyCharm terminal

# Conda

channels can be modified at ~/.condarc

some activation magic at $CONDA_PREFIX/etc/conda/activate.d

useful commands:

# refresh a package
conda install --force-reinstall package

# view package dependencies
# or at https://libraries.io/
conda create --dry-run --json -n dummy package

# search multiple channels for packages
conda install -c channel1 -c channel2 package

# view current env path
echo $CONDA_PREFIX

# remove env
conda env remove -n env_name
conda env remove -p env_path

# pip

useful commands


# reinstall a package
pip install --force-reinstall package==version

# generate requirements.txt
pip freeze > requirements.txt

# ignore installed version
pip install -I

# search in pypi first
# then search in the extra url
# pip 20.x chooses the latest possible version from the combined set
# pip 21.x performs an additional compatibility check
pip install --extra-index-url https://123.com

# view installation logs
TMPDIR=./tmp pip install --no-clean

# fresh install without using any cache(e.g., built/downloaded wheels)
pip install --no-cache-dir

# fresh install without using any binary => build binary locally
pip install --no-binary :all:

"""
When making build requirements available, pip does so in an isolated environment.
That is, pip does not install those requirements into the user’s site-packages, but
rather installs them in a temporary directory which it adds to the user’s sys.path
for the duration of the build. This ensures that build requirements are handled
independently of the user’s runtime environment. For example, a project that needs
a recent version of setuptools to build can still be installed, even if the user has
 an older version installed (and without silently replacing that version).

In certain cases, projects (or redistributors) may have workflows that explicitly
manage the build environment. For such workflows, build isolation can be
problematic. If this is the case, pip provides a --no-build-isolation
flag to disable build isolation. Users supplying this flag are responsible
for ensuring the build environment is managed appropriately (including ensuring
that all required build dependencies are installed).

build requirements are specified in pyproject.toml (PEP 518), e.g.,
[build-system]
requires = [
    "setuptools",
    "cython>=0.25,<3.0",
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
    "thinc>=8.0.12,<8.1.0",
    "blis>=0.4.0,<0.8.0",
    "pathy",
    "numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"
"""
pip install --no-build-isolation

pipdeptree -fl  # examine the dependency tree

pip install --dry-run package  # resolve without installing

# yaml

ruamel.yaml is a derivative of PyYAML, which is not actively developed. PyYAML supports the YAML 1.1 standard; ruamel.yaml supports YAML 1.2 as released in 2009.

More:

  • PyYAML YAML 1.2 status: https://github.com/yaml/pyyaml/issues/116

# Disable alias

import sys

import ruamel.yaml

class NonAliasingRTRepresenter(ruamel.yaml.representer.RoundTripRepresenter):
    def ignore_aliases(self, data):
        return True

yaml = ruamel.yaml.YAML()
yaml.Representer = NonAliasingRTRepresenter

yaml.dump(data, sys.stdout)

# Partial flow

YAML calls the indentation style “block” and the JSON style “flow”. Flow style can be used at any point within the block style.

import sys

import ruamel.yaml

def L(*l):
    ret = ruamel.yaml.comments.CommentedSeq(l)
    ret.fa.set_flow_style()
    return ret

yaml = ruamel.yaml.YAML()
data = {}
data['users'] = L()
data['users'].append('user2 key1')
data['users'].append('user2 key2')
data['users2'] = L('abc', L('user2 group1', 'user2 group2'))
yaml.dump(data, sys.stdout)

"""
users: [user2 key1, user2 key2]
users2: [abc, [user2 group1, user2 group2]]
"""

# Multiline

http://yaml-multiline.info/
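
A minimal sketch of the two main multiline styles, using ruamel.yaml as above:

import ruamel.yaml

doc = """\
literal: |
  line1
  line2
folded: >
  line1
  line2
"""
yaml = ruamel.yaml.YAML()
data = yaml.load(doc)
data['literal']  # 'line1\nline2\n' : | keeps newlines
data['folded']   # 'line1 line2\n'  : > folds newlines into spaces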

# Miscellaneous

  1. strip

    "[1,2,3,4]".strip("[1,2,]") # "3,4", any combination
    
  2. find

    "123".find('1') # 0
    
  3. raw string

    r"\x" # "\\x"
    r'\' # SyntaxError
    r"\\" # "\\\\"
    "\s"  # "\\s"
    r"\s"  # "\\s"
    "\f"  # \x0c
    r"\f"  # \\f
    
  4. pathlib

    pathlib.Path.cwd() # current work directory
    pathlib.Path(__file__).resolve() # absolute path of current file
    p = Path("/home/user/Downloads/repo/test.txt")
    p.stem # test name without extension
    p.name # test.txt full name
    json.loads(pathlib.Path('123.json').read_text())
    
    os.getcwd()
    os.path.dirname(os.path.realpath(__file__)) # directory
    os.path.split(os.path.realpath(__file__)) # directory and filename
    
    pathlib.Path('123/456/789/12.txt').parents[0] = pathlib.Path('123/456/789/12.txt').parent
    pathlib.Path('123/456/789/12.txt').parents[1] = pathlib.Path('123/456/789/12.txt').parent.parent
    

    return of Path is a special class
    it can be used with /
    pathlib.Path.cwd() / "dir"

  5. shutil

    shutil.copyfileobj(f, sys.stdout) # similar to cat
    shutil.move(f, target) # target can be on a different disk
    os.rename(f, target) # same disk only; on Windows an existing target raises an error
    os.replace(f, target) # same disk only; an existing target is replaced on any OS
    
    Path(f).rename(target) # same semantics as os.rename; similar to the mv command
    Path(f).replace(target) # same semantics as os.replace
    
  6. dict

    {"a":10}.get(keyname, valueifkeynotexist)
    {"a": 10}.pop("b")  # error
    {"a": 10}.pop("b", None)  # ok
    
  7. encoding

    Excel writes a byte order mark into UTF-8 files, so use utf_8_sig when reading them

    open()'s default encoding is decided by the environment's locale; check it with python -c 'import locale; print(locale.getpreferredencoding())' and set it with export LC_ALL=en_US.UTF-8; export LANG=en_US.UTF-8

    in Python 3.6, invalid_country.UTF-8 (a locale not listed by locale -a) makes the preferred encoding fall back to ascii; in Python 3.7, this problem does not exist

    sys.stdout.encoding is decided by the environment's locale and can be set with PYTHONIOENCODING=UTF-8

    b'123' -> '123':
    b'123'.decode('utf-8')

  8. string decomposition

    Takes , , è and for consideration:

    • unicodedata.normalize('NFD', text), canonical equivalence: è or

    • unicodedata.normalize('NFKD', text), compatibility equivalence: , , è and

    • unicodedata.decomposition(text), special format: , , è

    unicodedata.category(char) can be used to check whether a character is an unwanted accent mark (see the sketch below)
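
    A minimal sketch of stripping accents with NFD plus the category check ('Mn' is the combining-mark category):

    import unicodedata

    def strip_accents(text):
        # decompose, then drop the combining marks
        return ''.join(c for c in unicodedata.normalize('NFD', text)
                       if unicodedata.category(c) != 'Mn')

    strip_accents('è')  # 'e'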

  9. doc

    • dir(object): check all attributes
    • object.attr.__doc__: check how to use
    • ?object.attr in jupyter to check how to use
    • inspect.getsource(object.method): check source code
  10. yield

    return a generator to save space and some code

    def countdown_gen(x):
        count = x
        while count > 0:
            yield count
            count -= 1
    
    g = countdown_gen(5)
    
    for item in g:
        print(item) # 5, 4, 3, 2, 1
    

    assign a value back to yield

    def isPrime(n):
        return n > 1 and all(n % i for i in range(2, int(n ** 0.5) + 1))
    
    def getPrimes(number):
        while True:
            if isPrime(number):
                number = yield number # generator returns number, and assign value sent to number
            number += 1
    
    gen = getPrimes(10)
    gen.send(None) # first send must be None
    # next(gen) => gen.send(None)
    # 11
    # execution stuck at yield
    
    
    gen.send(10) # number becomes 10 and starts another round iteration
    # 11
    # execution stuck at yield
    
    def translator():
        # load all the words in English language and the translation to 'other lang'
        my_words_dict = {'hello': 'hello in other language', 'dog': 'dog in other language'}
    
        while True:
            word = yield
            yield my_words_dict.get(word, 'Unknown word...')
    
    my_words_translator = translator()
    
    next(my_words_translator) # stuck in first yield
    print(my_words_translator.send('dog')) # assign value to word and stuck in send yield
    
    next(my_words_translator) # stuck in first yield again
    print(my_words_translator.send('cat'))
    

    throw exception

    def add_to_database(connection_string):
        db = mydatabaselibrary.connect(connection_string)
        cursor = db.cursor()
        try:
            while True:
                try:
                    row = yield
                    cursor.execute('INSERT INTO mytable VALUES(?, ?, ?)', row)
                except CommitException:
                    cursor.execute('COMMIT')
                except AbortException:
                    cursor.execute('ABORT')
        finally:
            cursor.execute('ABORT')
            db.close()
    
    db = add_to_database('bla')
    db.send(None) # opens a database connection
    db.send('bla') # insert a row
    db.throw(CommitException) # abort the transaction
    

    yield from: transparent two way channel between the caller and the sub-generator, more info, but one simple usage is generating values from a iterator

    def flatten(sequence):
        """flatten a multi level list or something
        >>> list(flatten([1, [2], 3]))
        [1, 2, 3]
        >>> list(flatten([1, [2], [3, [4]]]))
        [1, 2, 3, 4]
        """
        for element in sequence:
            if hasattr(element, '__iter__'):
                yield from flatten(element)
            else:
                yield element
    
  11. naming variables and packages: identifier ::= (letter|"_") (letter | digit | "_")*; breaking this rule causes an interpreter error

    Importing a module whose name is not a valid identifier

    from importlib import import_module
    import_module('module-name')  # or __import__('module-name')
    
  12. declaration hoisting

    There is no hoisting in Python (see the sketch below)
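
    A minimal sketch: an assignment anywhere in a function makes the name local for the whole function body, so reading it before the assignment raises instead of falling back to an outer name:

    x = 1

    def f():
        print(x)  # UnboundLocalError: x is local to f because of the assignment below
        x = 10

    f()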

  13. lambda

    lambda *x: print(x)  # supports var_args
    
  14. json

    json.dump(..., ensure_ascii=True): saves 我 as \u6211, which can be read back under any encoding
    json.loads(): decodes \u6211 back to 我

    json.dump({"text": "我"}, open('123.json', 'w'), ensure_ascii=True)
    with open('123.json', encoding='latin_1') as reader:  # any encoding
        read_text = reader.read()
        json_text = json.loads(read_text)
        print(read_text)  # \u6211
        print(json_text)  # 我
    
    json.dump({"text": "我"}, open('123.json', 'w'), ensure_ascii=False)
    with open('123.json', encoding='latin_1') as reader:  # encoding error
        read_text = reader.read()
        json_text = json.loads(read_text)
        print(read_text)  # {"text": "我"}
        print(json_text)  # {'text': 'æ\x88\x91'}
    

    Take care control characters while json.loads()

    json.loads('{"apple": "good\nfruit"}', strict=False)
    json.loads(r'{"apple": "good\nfruit"}')
    

    More examples

    json.loads('["\\u00b9"]')
    json.loads('["¹"]')
    
  15. split

    If sep is not specified or is None, a different splitting algorithm is applied: runs of consecutive whitespace are regarded as a single separator, and the result will contain no empty strings at the start or end if the string has leading or trailing whitespace. Consequently, splitting an empty string or a string consisting of just whitespace with a None separator returns [].

    ' a b c '.split()  # ['a', 'b', 'c']
    ' a b c '.split(' ')  # ['', 'a', 'b', 'c', '']
    
  16. variable scope

    def kk(value):
        def gg():
            print(value)
        gg()
        value = 10
        gg()
    
    kk(3)
    # 3
    # 10
    