# Basics
Quick reference for syntax and usage. The focus is on system design, the fundamental design of the language itself, and algorithms.
# Loop
range
```python
range(10)            # 0 to 9
reversed(range(10))  # 9 to 0
for i in range(10):
    pass
i == 9  # True, the loop variable survives after the loop
```
zip
Comparison between Python 2 and 3:

python3
: zip returns an iterator.

python2
: zip returns a list.

```python
import itertools

zip([1, 2, 3, 4])        # iterator of [(1,), (2,), (3,), (4,)]
list(zip([1, 2, 3, 4]))  # [(1,), (2,), (3,), (4,)]
zip([1, 2, 3], [4, 5, 6])              # [(1, 4), (2, 5), (3, 6)]
zip([1, 2, 3], [1, 2, 3, 4, 5, 6, 7])  # [(1, 1), (2, 2), (3, 3)], stops at the shortest
itertools.zip_longest([1, 2], [4, 5, 6], fillvalue=None)  # [(1, 4), (2, 5), (None, 6)]
```
# String
format
%
```python
# number format
"%.2f%%" % (100 * 1 / 3)  # '33.33%', %% to escape %
# a positive width means right alignment for both strings and numbers
"%6.2f" % (33.33)   # ' 33.33', right aligned
"%-6.2f" % (33.33)  # '33.33 ', left aligned
"%x" % (8217)       # '2019', decimal to hex
("%%%ds" % length) % string  # width as parameter
"%*s" % (length, string)     # width as parameter
# dict format
to_print = {'name': 'david', 'age': 16}
"%(name)s, %(age)d" % to_print
```
format
"{0}".format(0.33) # "0.33" "{0:.0%}".format(0.33) # "33%" "{0:.2f}".format(0.33) # "0.33" "{0:6.2f}".format(0.33) # " 0.33" "{0:>6.2f}".format(0.33) # " 0.33" "{0:<6.2f}".format(0.33) # "0.33 " "{:6.2f}".format(0.33) # " 0.33" "{:,.2f}".format(1234) # "1,234.00" # string default left, numeric default right "{:8s}".format("guagua") # "guagua " "{:>8s}".format("guagua") # " guagua" '{1:>{0}}'.format(length, string) # indention as parameter "{value}".format(value=10) person = {"name": "guagua", "age": 10} "{name}, {age}".format(**person) "I " + "love " + "you"
f-string

TIP

An f-string is evaluated at runtime, which also makes it faster than `%` formatting and `str.format`. A backslash (`\`) is not allowed inside the `{}` expression part.

```python
name = "guagua"
age = 10
f"{name},{age:.2f}"   # guagua,10.00
# a bare width does not justify by itself
f"{name},{age:6f}"    # guagua,10.000000
# a positive width right-justifies numbers
f"{name},{age:6.2f}"  # guagua, 10.00
# > means right justified (default for numbers)
f"{name},{age:>6.2f}" # guagua, 10.00
# < means left justified
f"{name},{age:<6.2f}" # guagua,10.00
# < is the default for strings
f"start:{name:>10s},{age:6.2f}"  # start:    guagua, 10.00
# - does not work for left justification
f"{name},{age:-6d}"   # guagua,    10
f"{age:{length}}"     # pass the width as a variable
f"{name},{1234:,.2f}" # guagua,1,234.00
f"{func(arg)}"        # arbitrary expressions are evaluated
f"{{{age}}}"          # {10}
f"{ {age} }"          # {10}

class Test:
    def __init__(self):
        self.a = "111"

    def __str__(self):
        return f"{self.a}"

    def __repr__(self):
        return f"{self.a} hahah"

test = Test()
f"{test}"    # 111
f"{test!r}"  # 111 hahah

(f"I " f"love " f"you")  # I love you, adjacent literals concatenate, no + sign needed
f"I " \
f"love " \
f"you"
```
arabic
The bidirectional class of a character defines its display direction.

TIP

The default bidi base direction in the Python and JS engines is left-to-right. In PyCharm it is content-based. VS Code's terminal is problematic: an Arabic string is treated as a normal string.

left-to-right base direction:

```python
"""
1. Arabic words display from right to left
2. alphabetic characters display from left to right
3. the base direction controls how words are connected
4. in a left-to-right base direction, alphabetic characters are put at the existing
   text's right, even if the text is Arabic
5. an Arabic word [or number] is put at the right if the existing text is non-Arabic,
   otherwise at the left
6. \u200e, the left-to-right mark, changes an Arabic [number] to be put at the right
7. \u200f, the right-to-left mark, changes an Arabic [number] to be put at the left
8. \u202e + text + \u202c => txet
9. \u202e + arabic + \u202c => arabic
"""
ar = 'للصناعة'
# existing is none, 123 as it is, 123|cursor-->
# existing is 123 with ltr, the Arabic word is put at the right
# ar[0] = 'ل'
# 123|cursor-->'للصناعة'
'123' + ar  # => '123للصناعة'
'abc' + ar  # => 'abcللصناعة'
# ar[0] = 'ل'
# ar: <--cursor|'للصناعة'
# existing is Arabic, numbers [Arabic] are put at the left
# alphabetic characters [punctuation] are always put at the right
# numbers: cursor-->|'للصناعة'
# alphabets: 'للصناعة'|cursor-->
ar + '123'  # => 'للصناعة123'
ar + 'abc'  # => 'للصناعةabc'
ar + '\n'   # => 'للصناعة\n'
'123' + ar + '456'  # => '123للصناعة456'
# \u200e changes numbers to be put at the right
# numbers: 'للصناعة'|cursor-->
'123' + ar + '\u200E' + '456'  # => '123للصناعة456'
```
Encoding
```python
# 'x': decimal to hex
'%04x' % ord('我')        # '6211'
chr(25105)                # '我'
'我'.encode('utf-8')      # b'\xe6\x88\x91'
# \xef\xbb\xbf is the byte order mark indicating the utf-8 encoding scheme
'我'.encode('utf-8-sig')  # b'\xef\xbb\xbf\xe6\x88\x91'
"""
\xfe\xff or \xff\xfe is a byte order mark indicating endianness and encoding scheme.
If python reads \xff\xfe, it knows utf-16 is using little endian;
if it reads \xfe\xff, it knows utf-16 is using big endian.
In this case utf-16 uses little endian:
\u6211 in little endian is \x11\x62; byte \x62 is 'b' in ascii.
"""
'我'.encode('utf-16')    # b'\xff\xfe\x11b'
'我'.encode('utf-16le')  # b'\x11b'
'我'.encode('utf-16be')  # b'b\x11'
```
Raw string
raw strings are not 100% raw
```python
r'\'         # error, a raw string cannot end with a single backslash
r'\n'        # \n
r'\"'        # \"
r'''123''''  # error
r'''12'3'''  # 12'3
```
- Common method

capitalize the first letter of each word: `st.title()`
# Syntax sugar
if else
```python
x = 10 if a > b else 9  # ternary operator (conditional expression)
# nested conditional assignment
sign = "positive" if num > 0 else "negative" if num < 0 else "zero"
lambda x: True if x % 2 == 0 else False  # or simply: lambda x: x % 2 == 0
[x for x in y if x > 0]
[x if x > 10 else 5 for x in y]
```
tuple
```python
l = 10,  # the trailing comma makes a tuple: (10,)
```
raw string
```python
l = r'\'   # error, a single trailing backslash cannot be rawed
l = '\\'   # works
l = r'\s'  # \s
l = '\\s'  # \s
```
`&` and `|`

```python
import collections

{1, 2, 3} & {3}  # {3}
{1, 2, 3} | {4}  # {1, 2, 3, 4}
collections.Counter([1, 2, 3, 3]) & collections.Counter([3, 2, 2])  # Counter({2: 1, 3: 1})
collections.Counter([1, 2, 3, 3]) | collections.Counter([3, 2, 2])  # Counter({1: 1, 2: 2, 3: 2})
```
sort by two keys
```python
sorted(a_list, key=lambda x: (x.key1, x.key2))
```
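When the two keys must sort in opposite directions, negating a numeric key is a common trick; a minimal sketch with made-up tuples:

```python
records = [("b", 2), ("a", 2), ("a", 1)]
# ascending by name, then descending by the number
sorted(records, key=lambda r: (r[0], -r[1]))  # [('a', 2), ('a', 1), ('b', 2)]
```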
list concatenation
```python
l = []
l[:0] = [1, 2, 3]  # [1, 2, 3], slice assignment at the front
```
one liner
```
stmt: simple_stmt | compound_stmt
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
```
`;` separates non-compound statements on a single line; the grammar makes no allowance for a non-compound statement and a compound statement separated by a semicolon. Example with a compound `for` statement:

```bash
python -c $'x=10\nfor i in range(x): print(i)'  # use \n to separate a non-compound and a compound statement
```
in
```python
def __contains__(self, needle):
    for elem in self:
        if needle is elem or needle == elem:
            return True
    return False
```
# Error handling
TIP
If the code in the except block raises an unhandled exception, the previously caught exception is reported again, in the format:

```
original_exception

During handling of the above exception, another exception occurred:

new_exception
```
```python
try:
    code()
except Exception as e:
    code_throw_exception()
```
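Chaining can also be controlled explicitly; a minimal sketch of `raise ... from ...`:

```python
try:
    int("x")
except ValueError as e:
    # reported as "The above exception was the direct cause of the following exception:"
    raise RuntimeError("bad input") from e
    # raise RuntimeError("bad input") from None  # suppresses the chained traceback
```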
# Regex
re.sub
re.sub(r"[^a-b]", ' ', "mystring") re.sub('\n', '', '123\n456') # 123456 re.sub(r'\n', '', '123\n456') # 123456, \n raw string is treated as new line in regex re.sub('\\n', '', '123\n456') # 123456, \n raw string is treated as new line in regex # returns '-a-b--d-' # Empty matches for the pattern are replaced when adjacent to a previous non-empty match # five matches due to * |a|bx|d| re.sub('x*', '-', 'abxd') re.sub('.*', '123', 'abc') # abc| => 123123
re.findall
re.findall("dss", "gddss") # ["dss"], a list re.findall("dss", "gdssdss") # ["dss", "dss"], a list re.findall("d(ss)", "gddss") # ["ss"], a list re.findall("(12)|(34)", "123445") # [('12', '34')] re.findall("(12)|(34)", "123445") # [('12', ''), ('', '34')] re.search("d(ss)", "gddss") # a match group, where 0 is the full match(dss), group 1 is ss. re.search("dss", "gdssdss") # only has group 0(dss) re.match("dss", "gddss") # None, re.match("dss") <=> re.search("^dss") avoiding using re.match
re.MULTILINE

With `re.MULTILINE`, `^` in a pattern matches at the start of every line. The flag is invalid for `re.match`, which only matches at the beginning of the string.
lookaround

Order

- behind -> ahead
- `(?<=abc)123`: first find 123, then look behind for abc

Trivial patterns

- `something(?<=something)` and `(?=something)something` are trivial: `123(?<=123)abc`, `123(?=abc)abc`, `(123abc)456(?<=123)abc` (no matched string exists)
- `re.search("^(?<=abc)123")` or `re.match("(?<=abc)123")` always matches nothing.
- If a lookahead has something ahead of it, that must be a substring of the lookahead (or `.`). If a lookbehind has something behind it, that must be a substring of the lookbehind.

Flexibility

- `(?=something)` and `(?!something)` are flexible: `something` can be any regex pattern
- lookahead supports the or operation, but lookbehind doesn't (fixed-width jumping back to save speed)
- `^((?!123).)*$`: from start to end, as long as there is no 123 inside (negative lookahead has a flexible body)
- `^((?!123|456).)*$`: from start to end, as long as there is neither 123 nor 456 inside
- `(?:(?<=123)|(?<=456))`: preceded by 123 or 456
- `(?<!123)(?<!456)`: preceded by neither 123 nor 456
\b

`\b` matches only if there is a valid "word" character on one side and a non-word character on the other:

- `)\b` does not match in `) word`, since both sides of `)` are non-word characters
- `a\b` matches in `a word`
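A quick check of the two cases with `re`:

```python
import re

re.search(r'\)\b', ') word')  # None: both neighbors of the candidate boundary are non-word
re.search(r'a\b', 'a word')   # <re.Match ...>: 'a' is a word character, ' ' is not
```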
fuzzy match (from `regex`, not `re`)

Regex usually attempts an exact match, but sometimes an approximate, or “fuzzy”, match is needed, for those cases where the text being searched may contain errors in the form of inserted, deleted or substituted characters.
A fuzzy regex specifies which types of errors are permitted, and, optionally, either the minimum and maximum or only the maximum permitted number of each type. (You cannot specify only a minimum.)
The 3 types of error are:
- Insertion, indicated by “i”
- Deletion, indicated by “d”
- Substitution, indicated by “s”
In addition, “e” indicates any type of error.
The fuzziness of a regex item is specified between “{” and “}” after the item.
Examples:
- `foo` match “foo” exactly
- `(?:foo){i}` match “foo”, permitting insertions
- `(?:foo){d}` match “foo”, permitting deletions
- `(?:foo){s}` match “foo”, permitting substitutions
- `(?:foo){i,s}` match “foo”, permitting insertions and substitutions
- `(?:foo){e}` match “foo”, permitting errors
If a certain type of error is specified, then any type not specified will not be permitted.
In the following examples I’ll omit the item and write only the fuzziness:
- `{d<=3}` permit at most 3 deletions, but no other types
- `{i<=1,s<=2}` permit at most 1 insertion and at most 2 substitutions, but no deletions
- `{1<=e<=3}` permit at least 1 and at most 3 errors
- `{i<=2,d<=2,e<=3}` permit at most 2 insertions, at most 2 deletions, at most 3 errors in total, but no substitutions
It’s also possible to state the costs of each type of error and the maximum permitted total cost.
Examples:
- `{2i+2d+1s<=4}` each insertion costs 2, each deletion costs 2, each substitution costs 1, the total cost must not exceed 4
- `{i<=1,d<=1,s<=1,2i+2d+1s<=4}` at most 1 insertion, at most 1 deletion, at most 1 substitution; each insertion costs 2, each deletion costs 2, each substitution costs 1, the total cost must not exceed 4
You can also use “<” instead of “<=” if you want an exclusive minimum or maximum.
You can add a test to perform on a character that’s substituted or inserted.
Examples:
- `{s<=2:[a-z]}` at most 2 substitutions, which must be in the character set `[a-z]`
- `{s<=2,i<=3:\d}` at most 2 substitutions, at most 3 insertions, which must be digits
By default, fuzzy matching searches for the first match that meets the given constraints. The `ENHANCEMATCH` flag will cause it to attempt to improve the fit (i.e. reduce the number of errors) of the match that it has found. The `BESTMATCH` flag will make it search for the best match instead.

Further examples to note:

- `regex.search("(dog){e}", "cat and dog")[1]` returns `"cat"` because that matches `"dog"` with 3 errors (an unlimited number of errors is permitted).
- `regex.search("(dog){e<=1}", "cat and dog")[1]` returns `" dog"` (with a leading space) because that matches `"dog"` with 1 error, which is within the limit.
- `regex.search("(?e)(dog){e<=1}", "cat and dog")[1]` returns `"dog"` (without a leading space) because the fuzzy search matches `" dog"` with 1 error, which is within the limit, and the `(?e)` then attempts a better fit.
In the first two examples there are perfect matches later in the string, but in neither case is it the first possible match.
The match object has an attribute `fuzzy_counts` which gives the total number of substitutions, insertions and deletions.
```python
>>> # A 'raw' fuzzy match:
>>> regex.fullmatch(r"(?:cats|cat){e<=1}", "cat").fuzzy_counts
(0, 0, 1)
>>> # 0 substitutions, 0 insertions, 1 deletion.

>>> # A better match might be possible if the ENHANCEMATCH flag is used:
>>> regex.fullmatch(r"(?e)(?:cats|cat){e<=1}", "cat").fuzzy_counts
(0, 0, 0)
>>> # 0 substitutions, 0 insertions, 0 deletions.
```
The match object also has an attribute `fuzzy_changes` which gives a tuple of the positions of the substitutions, insertions and deletions.

```python
>>> m = regex.search('(fuu){i<=2,d<=2,e<=5}', 'anaconda foo bar')
>>> m
<regex.Match object; span=(7, 10), match='a f', fuzzy_counts=(0, 2, 2)>
>>> m.fuzzy_changes
([], [7, 8], [10, 11])
```
What this means is that if the matched part of the string had been:
'anacondfuuoo bar'
it would’ve been an exact match.
However, there were insertions at positions 7 and 8:
```
'anaconda fuuoo bar'
        ^^
```
and deletions at positions 10 and 11:
```
'anaconda f~~oo bar'
           ^^
```
So the actual string was:
'anaconda foo bar'
```python
import re

# catastrophic backtracking: the 1st \W* and the 2nd \W* create a huge number of combinations
re.search(r'^(\W*K\W*)+$', 'K K K K K K K K 6')
# fine: \W*K leaves little room for combinations
re.search(r'^(\W*K)+\W*$', 'K K K K K K K K 6')
```
# Arithmetic
- `**`: power
- `val // num` => floor
- `-(-val // num)` => ceil
- floating point is inexact: `449027587.27 - 315000000 != 134027587.27`

```python
from decimal import Decimal

# construct Decimals from strings; Decimal(449027587.27) would carry over the float's error
Decimal('449027587.27') - Decimal('315000000') == Decimal('134027587.27')  # True
```
Operator precedence, from high to low:

`**` > `~x` > `+x`, `-x` > `*`, `/`, `//`, `%` > `+`, `-` > `<<`, `>>` > `&` > `^` > `|` > `!=`, `==`
# Number bases
- hex <=> decimal
```python
hex(12)          # '0xc'
format(12, 'x')  # 'c' (without the '0x' prefix)
int("0xc", 16)   # 12
```
- bin <=> decimal
```python
bin(12)           # '0b1100'
format(12, 'b')   # '1100'
int("0b1100", 2)  # 12
```
# List
# Basics
```python
l1 = [1, 2, 3]
l1.extend([4, 5, 6])
l1.append(7)
[8] + l1
l1[0::2]
l1[::-1]

a = [1, 2, 3]
b = [4, 5, 6]
c = [*a, *b]  # [1, 2, 3, 4, 5, 6]
mixed_types = [1, 2, 3, 'abc']
```
# pydash
- `pydash.key_by` and `pydash.group_by` keep references to the original items
- the `iteratee` of `pydash.key_by` and `pydash.group_by` is evaluated as a property path if it is `.`-delimited, or a list with more than one item
- `pydash.get` accepts a path in both `.` and list format
# Logging
levels
- CRITICAL
- ERROR
- WARNING
- INFO
- DEBUG
- NOTSET
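Each name maps to a number (CRITICAL 50 down to NOTSET 0), and a logger only emits records at or above its effective level; a quick check:

```python
import logging

(logging.CRITICAL, logging.ERROR, logging.WARNING)  # (50, 40, 30)
(logging.INFO, logging.DEBUG, logging.NOTSET)       # (20, 10, 0)

log = logging.getLogger("demo")
log.setLevel(logging.WARNING)
log.isEnabledFor(logging.INFO)   # False
log.isEnabledFor(logging.ERROR)  # True
```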
mechanism
```python
import logging

log = logging.getLogger(__name__)
log.info("Hello, world")
```

The message is turned into a `LogRecord` object and routed to a `Handler` object registered for this logger. The handler then uses a `Formatter` to turn the LogRecord into a string and emit that string.

lazy evaluation
```python
logger.debug('this is a debug message %s', var)  # lazy evaluation (formatted only if the debug level is enabled)
# always evaluated
logger.debug(f'this is a debug message {var}')
logger.debug('this is a debug message %s' % var)
```
usage
`basicConfig`: initializes logging and adds a `Formatter` and a `Handler` to the root logger if none exist (`logging.info` calls `basicConfig` internally if no handler is available).

```python
import logging
import os

# exports to stderr
# ERROR:the.module.name:The log message
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
```
OneLineExceptionFormatter
```python
import logging
import os

class OneLineExceptionFormatter(logging.Formatter):
    def formatException(self, exc_info):
        result = super().formatException(exc_info)
        return repr(result)

    def format(self, record):
        result = super().format(record)
        if record.exc_text:
            result = result.replace("\n", "")
        return result

# the default level for a handler is NOTSET
handler = logging.StreamHandler()
# BASIC_FORMAT = "%(levelname)s:%(name)s:%(message)s"
formatter = OneLineExceptionFormatter(logging.BASIC_FORMAT)
handler.setFormatter(formatter)
# the default level for a logger is NOTSET
# no name given, returns the root logger
root = logging.getLogger()
root.setLevel(os.environ.get("LOGLEVEL", "INFO"))
root.addHandler(handler)

# global try/catch
try:
    exit(main())
except Exception:
    # logging.exception(msg) == logging.error(msg, exc_info=True)
    # exc_info contains the stack trace
    # if there is no other exception handler, outputting the stack trace
    # is okay; otherwise, log the error with a summary only, i.e.,
    # logging.exception(msg, exc_info=False)
    logging.exception("Exception in main(): ")
    exit(1)
```
ColoredLogger
```python
import logging
import os

class CustomFormatter(logging.Formatter):
    """Logging Formatter to add colors and count warnings / errors"""

    grey = "\x1b[38;21m"
    yellow = "\x1b[33;21m"
    red = "\x1b[31;21m"
    bold_red = "\x1b[31;1m"
    reset = "\x1b[0m"
    fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)"

    FORMATS = {
        logging.DEBUG: grey + fmt + reset,
        logging.INFO: grey + fmt + reset,
        logging.WARNING: yellow + fmt + reset,
        logging.ERROR: red + fmt + reset,
        logging.CRITICAL: bold_red + fmt + reset,
    }

    def format(self, record):
        log_fmt = self.FORMATS.get(record.levelno)
        formatter = logging.Formatter(log_fmt)
        return formatter.format(record)

# the default level for a handler is NOTSET
handler = logging.StreamHandler()
handler.setFormatter(CustomFormatter())
# the default level for a logger is NOTSET
# no name given, returns the root logger
root = logging.getLogger()
root.setLevel(os.environ.get("LOGLEVEL", "INFO"))
root.addHandler(handler)

# global try/catch
try:
    exit(main())
except Exception:
    # logging.exception(msg) == logging.error(msg, exc_info=True)
    # exc_info contains the stack trace
    logging.exception("Exception in main(): ")
    exit(1)
```
Log to file
```python
import logging
import logging.handlers
import os

handler = logging.handlers.WatchedFileHandler(
    os.environ.get("LOGFILE", "/var/log/yourapp.log"))
formatter = logging.Formatter(logging.BASIC_FORMAT)
handler.setFormatter(formatter)
root = logging.getLogger()
root.setLevel(os.environ.get("LOGLEVEL", "INFO"))
root.addHandler(handler)

# global try/catch
try:
    exit(main())
except Exception:
    logging.exception("Exception in main()")
    exit(1)
```
load the logging configuration from a configuration file
```yaml
version: 1
disable_existing_loggers: true
formatters:
  simple:
    format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
handlers:
  console:
    class: logging.StreamHandler
    level: INFO
    formatter: simple
    stream: ext://sys.stdout
  file:
    class: logging.FileHandler
    level: DEBUG
    filename: logs/dbInteract.log
loggers:
  # they override the existing ones;
  # with disable_existing_loggers, the non-defined loggers won't work
  simpleExample:
    level: DEBUG
    handlers: [console]
    propagate: no
root:
  level: DEBUG  # the logger level should be above or equal to the handler's level
  handlers: [console, file]
```

```python
import logging.config
import yaml

with open('./test.yml', 'r') as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader)
logging.config.dictConfig(config)
```
remarks
- When deploying to containers, keep things as simple as possible: log to standard out/err and rely on your container host or orchestration platform to figure out what to do with the logs
- when exporting to stderr, pycharm marks the messages in red
- `gunicorn` adds additional handlers, whereas `tensorflow` will not if a root logger already exists
- set up the logging system after the necessary packages are imported, to take control of them
- to disable a specific package: `logging.getLogger('name').disabled = True`
- `logging.info` uses the root logger; `logging.getLogger()` gets the root logger
# import
Background
Module execution via import statement (i.e., `import <modulename>`):

- `sys.path` is not modified in any way
- `__name__` is set to the absolute form of `<modulename>`
- `__package__` is set to the immediate parent package in `<modulename>`
- `__init__.py` is evaluated for all packages (including its own for package modules)
- `__main__.py` is not evaluated for package modules; the code is evaluated for code modules
Module execution via command line with filename (i.e., `python <filename>`):

- `sys.path` is modified to include the final directory in `<filename>`
- `__name__` is set to `__main__`
- `__package__` is set to `None`
- `__init__.py` is not evaluated for any package (including its own for package modules)
- `__main__.py` is evaluated for package modules; the code is evaluated for code modules
Module execution via command line with modulename (i.e., `python -m <modulename>`):

- `sys.path` is modified to include the current directory
- `__name__` is set to `__main__`
- `__package__` is set to the immediate parent package in `<modulename>`
- `__init__.py` is evaluated for all packages (including its own for package modules)
- `__main__.py` is evaluated for package modules; the code is evaluated for code modules
import steps
""" project structure /usr/test(root): - app.py # __package__ is '' - another_app.py # __package__ is '' - moduleA(pakcage): - class_a.py # __package__ is moduleA - class_b.py # __package__ is moduleA """
- resolve relative imports: if there is `from .xx import yy` or `from ..xx import yy` etc., `__package__` is used to resolve them. `__package__` might be None, the empty string, or the package name. Different modules (files) can have different `__package__` values in the same python run. Modules in the root folder have the empty string as `__package__`, and other modules have the package they reside in as `__package__`. Note that if a python program is run as a script (without -m), `__package__` becomes None, so relative imports will not work.

```python
""" app.py """
print(f'__package__ is {__package__}')

""" another_app.py """
import app

""" class_a.py """
from .. import app
```
```bash
# case 1
# if a module (file) is at root, or top-level, __package__ is the empty string
(/usr/test) $ python -m another_app
>>> __package__ is ''

# case 2
# when a top-level module is run as the entry point with -m, __package__ is the empty string
(/usr/test) $ python -m app
>>> __package__ is ''

# case 3
# the number of dots cannot exceed the number of packages in __package__;
# class_a has a 1-level package, moduleA, so .. does not work
(/usr/test) $ python -m moduleA.class_a
>>> ValueError: attempted relative import beyond top-level package
# to make it work, switch the root to /usr so that __package__ of class_a.py becomes test.moduleA
(/usr) $ python -m test.moduleA.class_a
>>> __package__ is test

# case 4
# if the program is run as a script, i.e., without -m, __package__ is None
(/usr/test) $ python app.py
>>> __package__ is None
```
- search from `sys.path`

```python
""" class_a.py """
import sys
print(f'sys.path[0] is {sys.path[0]}')
```

```bash
# the directory containing the script, i.e., /usr/test/moduleA
(/usr/test) $ python moduleA/class_a.py
>>> sys.path[0] is /usr/test/moduleA

# the current directory, i.e., /usr/test
(/usr/test) $ python -m moduleA.class_a
>>> sys.path[0] is /usr/test

# interactive shell
(/usr/test) $ python
~~~ import sys
~~~ sys.path[0]
>>> ''
```
- `__init__.py`

WARNING

Since python 3.3, an empty `__init__.py` is no longer needed to mark a folder as a python package.

```python
# moduleA's __init__.py runs in all of these cases; a package's __init__.py
# executes on the first import of the package or any of its submodules
from app.moduleA import method
import app.moduleA
import app.moduleA.sub_module  # works only if sub_module is a module, not a function
```
Reloading
After code changes, the running process still executes the outdated code, but the stack trace shows lines from the updated source file (so the two can mismatch).

```python
import importlib

# only a module can be reloaded
importlib.reload(com.package.module)
from com.package.module import method
```
Variable control
The `__all__` variable controls what can be imported by `from x import *`.
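A minimal sketch with a hypothetical module `mylib.py`:

```python
# mylib.py
__all__ = ['public_fn']

def public_fn():
    return 'visible'

def _private_fn():
    return 'hidden'

# elsewhere:
# `from mylib import *` binds only public_fn;
# _private_fn is still reachable via `import mylib; mylib._private_fn()`
```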
Miscellaneous
```python
from A.B.C import D
from B.C import D  # suppose A is added to PYTHONPATH
"""
Modifying variables inside the 1st D will not change those inside the 2nd D,
since the __package__ of the two Ds differ: they are two distinct module objects.
"""
```

```python
import A
A.B  # error: a submodule cannot be accessed if not imported
import A.B  # okay
```
# OOP
- `is` is used to check if two variables refer to the same object
- `self` has access to class variables, but read-only: assigning through `self` creates an instance attribute (see the sketch after this list)
  - it is recommended to access class methods through `self` for simplicity
  - it is recommended to access class variables through `type(self)` instead of `self.__class__`

Comparison of `type(self)` with `self.__class__`

- `type()` is the same as `__class__` in python3
- in python2, if a class has no inheritance (old-style class), only `__class__` works
child has access to parent's overridden method
```python
class Foo(Bar):
    def __init__(self, arg):
        super(Foo, self).__init__(arg)

    def baz(self, arg):
        # the method implementation is from the parent, but self refers to the child
        return super(Foo, self).baz(arg)
```
child has access to grandparent's overridden method
```python
class Bar(Cha):
    def __init__(self, arg):
        super(Bar, self).__init__(arg)

class Foo(Bar):
    def __init__(self, arg):
        super(Foo, self).__init__(arg)

    def chz(self, arg):
        # the method implementation is from the grandparent, but self refers to the child;
        # note the 1st argument of super is Bar instead of Foo
        return super(Bar, self).chz(arg)
```
type
The type of a class is `type` if no custom metaclass is given.

```python
class PythonBlog:
    pass

class PythonBlogSon(PythonBlog):
    pass

type(PythonBlog)     # type
type(PythonBlogSon)  # type

class MyMeta(type):
    pass

class PythonBlog(metaclass=MyMeta):
    pass

class PythonBlogSon(PythonBlog):
    pass

type(PythonBlog)     # MyMeta
type(PythonBlogSon)  # MyMeta
```
__new__
rarely used

```python
class Eel(object):
    MAX_EELS = 20
    n_eels = 0

    # memory allocation and static variable manipulation;
    # called to create an instance
    def __new__(cls, *args, **kwargs):
        if cls.n_eels == cls.MAX_EELS:
            raise HovercraftFull()
        obj = super(Eel, cls).__new__(cls)
        cls.n_eels += 1
        return obj
```
__call__
```python
class Foo:
    def __init__(self, a, b, c):
        pass

class Goo:
    def __call__(self, a, b, c):
        pass

Foo(1, 2, 3)  # __init__ is called
goo = Goo()
goo(1, 2, 3)  # __call__ is called

class Singleton(type):
    _instances = {}

    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
        return cls._instances[cls]

class MyClass(metaclass=Singleton):
    pass
```
__dict__
stores class attributes for class and instance attributes for instance
```python
class Test:
    def __init__(self):
        self.a = 10

    def execute(self):
        return 10

"""
mappingproxy({'__module__': '__main__',
              '__init__': <function __main__.Test.__init__(self)>,
              'execute': <function __main__.Test.execute(self)>,
              '__dict__': <attribute '__dict__' of 'Test' objects>,
              '__weakref__': <attribute '__weakref__' of 'Test' objects>,
              '__doc__': None})
"""
Test.__dict__
Test().__dict__  # {'a': 10}
```
__wrapped__
get access to wrapped underlying function
```python
import functools

class Memoize:
    def __init__(self, func):
        self.func = func
        self.memo = dict()
        # updates __module__, __name__, __qualname__, __annotations__ and __doc__;
        # no update on __dict__
        functools.update_wrapper(self, func, updated=[])

    def __call__(self, *args):
        if args not in self.memo:
            self.memo[args] = self.func(*args)
        else:
            print("cls decorator. You have printed this before")
        return self.memo[args]

class CallCounter:
    def __init__(self, func):
        self.func = func
        self.calls = 0
        # updates __module__, __name__, __qualname__, __annotations__ and __doc__;
        # updates __dict__, i.e., {**self.__dict__, **wrapped.__dict__}
        functools.update_wrapper(self, func)

    def __call__(self, *args, **kwargs):
        self.calls += 1
        return self.func(*args, **kwargs)

@Memoize
@CallCounter
def doubleprint(x):
    for elem in x:
        print(elem + " " + elem)

doubleprint                          # Memoize at 0x7fa8034edcc0
doubleprint.__wrapped__              # CallCounter at 0x7fa8019540b8
doubleprint.__wrapped__.__wrapped__  # the function itself
```
`__eq__` and `__hash__`

```python
class Number:
    def __init__(self, number):
        self.number = number

    def __eq__(self, other):
        """Overrides the default implementation"""
        if isinstance(other, Number):
            return self.number == other.number
        # delegate to other's __eq__
        return NotImplemented

    # must return an int
    def __hash__(self):
        """Overrides the default implementation"""
        return hash(tuple(sorted(self.__dict__.items())))

n1 = Number(2)
o1 = Other('other')
# tries n1's __eq__ first, then o1's, if Other is not a subclass of Number;
# tries o1's __eq__ first, then n1's, if Other is a subclass of Number
n1 == o1
```
staticmethod is inherited
```python
class A:
    @staticmethod
    def a():
        print('A')

class B(A):
    @staticmethod
    def b():
        print('b')

B.a()  # A
```
method resolution order
definition
```python
class Child(Parent):
    # determines the method resolution order
    def method(self, arg):
        """
        child methods (including __init__ and others) follow the method
        resolution order, i.e., mro. Note that mro traversal can be either
        disconnected or connected:
        1. Disconnected mro: if a class has no super().method(arg), resolution
           stops as soon as this class's method call is finished
        2. Connected mro: if every class excluding object calls
           super().method(arg), methods in every generation are called
        """
        # super().method(arg) equivalence
        mro = type(self).mro()
        for next_class in mro[mro.index(Child) + 1:]:  # slice to the end
            if hasattr(next_class, 'method'):
                next_class.method(self, arg)
                break
```
mro examples
```python
# mro example I
class Parent(object): pass
class ChildA(Parent): pass
class ChildB(Parent): pass
class Grandchild(ChildA, ChildB): pass

Grandchild.__mro__  # (Grandchild, ChildA, ChildB, Parent, object)

# mro example II
class Parent:
    def __init__(self, x):
        self.x = x
        print("initializing Parent")

class ChildA(Parent): pass

# if a class in the mro misses the method, that class is skipped
a = ChildA(10)  # skips ChildA, and Parent's __init__ is called
a.x  # 10

# mro example III
class Parent(object):
    def __init__(self, x):
        self.x = x
        print('initializing Parent')

    def gg(self):
        print('Parent!')

class ChildA(Parent):
    def __init__(self):
        print('initializing ChildA')

"""
the mro for both __init__ and gg is [ChildA, Parent]
execution order of __init__: ChildA starts, ChildA ends
execution order of gg: ChildA is skipped (no gg), Parent starts, Parent ends
"""
a = ChildA()
a.gg()  # works
a.x     # error

# mro example IV
class Parent(object):
    def __init__(self):
        print('initializing Parent')

class ChildA(Parent):
    def __init__(self):
        print('initializing ChildA')
        super().__init__()

class ChildB(Parent):
    def __init__(self):
        print('initializing ChildB')
        super().__init__()

class Grandchild(ChildA, ChildB): pass

"""
connected mro in the __init__ method
mro is G, A, B, Parent
execution order:
  G skips (no __init__ method)
  A starts
  B starts
  Parent starts
  Parent ends
  B ends
  A ends
execution result:
  initializing ChildA
  initializing ChildB
  initializing Parent
"""
Grandchild()

# mro example V
class Parent(object):
    def __init__(self):
        print('initializing Parent')

class ChildA(Parent):
    def __init__(self):
        print('initializing ChildA')
        # Parent.__init__(self) instead of super().__init__() disconnects the mro of __init__
        Parent.__init__(self)

class ChildB(Parent):
    def __init__(self):
        print('initializing ChildB')
        super().__init__()

class Grandchild(ChildA, ChildB): pass

"""
disconnected mro in the __init__ method
mro is G, A, B, Parent
execution order:
  G skips (no __init__ method)
  A starts
  Parent starts
  Parent ends
  A ends
execution result:
  initializing ChildA
  initializing Parent
"""
Grandchild()

# mro example VI
class Parent(object):
    def gg(self):
        print('Parent!')

class ChildA(Parent): pass

class ChildB(Parent):
    def gg(self):
        print('Child B!')

class Grandchild(ChildA, ChildB): pass

"""
disconnected mro in the gg method
mro is G, A, B, Parent
execution order:
  G skips (no own gg method)
  A skips (no own gg method)
  B starts
  B ends
execution result:
  Child B!
"""
g = Grandchild()
g.gg()
```
Enum
An enumeration is a set of symbolic names (members) bound to unique, constant values
```python
from enum import Enum

class TestEnum(Enum):
    a = 0
    b = 1

class Test:
    a = 0
    b = 1

# Enum is iterable
list(TestEnum)  # [<TestEnum.a: 0>, <TestEnum.b: 1>]
list(Test)      # error

# Enum members have value and name
TestEnum.a        # <TestEnum.a: 0>
Test.a            # 0
TestEnum.a.value  # 0
Test.a.value      # error
TestEnum.a.name   # 'a'
Test.a.name       # error

# Enum members cannot be reassigned
TestEnum.a = 10  # error
Test.a = 10      # fine
```
Enum values can be functions
```python
import functools
from enum import Enum

class Wrapper:
    def __init__(self, f):
        self.f = f
        # updates __module__, __name__, __qualname__, __annotations__ and __doc__;
        # updates __dict__
        functools.update_wrapper(self, f)

    def __call__(self, *args, **kwargs):
        return self.f(*args, **kwargs)

    def __repr__(self):
        return self.f.__repr__()

def fa():
    return 'A'

class TestEnum(Enum):
    # the wrapper makes `a` an attribute instead of a method definition
    a = Wrapper(fa)

    def __call__(self, *args, **kwargs):
        return self.value(*args, **kwargs)

TestEnum.a()  # 'A'
```
Enum values can be class
```python
from enum import Enum

class CA:
    def __init__(self, name):
        self.name = name

    def execute(self):
        return 'A'

class TestEnum(Enum):
    a = CA

    def __call__(self, *args, **kwargs):
        return self.value(*args, **kwargs)

TestEnum.a('some name').execute()  # 'A'
```
# Pickle
# general
Pickle of spacy.tokenizer.Tokenizer

After loading, `Tokenizer` will call the `token_match` method, hence `token_match` must not depend on variables that are restored later than `Tokenizer`.

- `dump`: might call `__getstate__` or `__reduce__`
- `load`: might call `__new__` or `__setstate__`

Pickleable objects are guaranteed to have the same value after un-pickling, but might produce a different `dump` value from the original dump: link
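A minimal sketch of the `dump`/`load` hooks, assuming an unpicklable attribute (an open file handle):

```python
import pickle

class Logger:
    def __init__(self, path):
        self.path = path
        self.fh = open(path, 'a')  # file handles cannot be pickled

    def __getstate__(self):  # consulted by dump
        state = self.__dict__.copy()
        del state['fh']      # drop the unpicklable part
        return state

    def __setstate__(self, state):  # consulted by load
        self.__dict__.update(state)
        self.fh = open(self.path, 'a')  # rebuild it

restored = pickle.loads(pickle.dumps(Logger('/tmp/x.log')))
```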
# Cannot pickle
```python
import functools
import pickle

def decorate(f):
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        print(pickle.dumps(f))
        return f(*args, **kwargs)
    return wrapper

@decorate
def main():
    print('A')

if __name__ == '__main__':
    """
    Pickle serializes only metadata about the function and restores it from that
    metadata. But at this point the decorator's wrapper has already replaced the
    main function, so the pickled function does not match the function that
    pickle tries to restore.

    Traceback (most recent call last):
    ...
    _pickle.PicklingError: Can't pickle <function main at 0x7ff1df907ee0>: it's not the same object as __main__.main
    """
    main()
```
# exec vs eval
exec("", scope)
: always return None. print
or import
takes side effect
eval
: accepts expression only. assignment or statement(like if) are rejected
# Closure
A closure is a function together with its enclosing environment.

Scope

`nonlocal` and `global` are keywords that control variable scope. Note that a variable created in the main body of Python code is a global variable and belongs to the global scope.
```python
def generate_counter2():
    x = 0
    def add_one():
        # by declaring x nonlocal, Python finds it in the parent scope
        # and closes over it
        nonlocal x
        x = x + 1
        return x
    return add_one

# closure = generate_counter2()
# closure.__closure__  => address 1
# closure()
# closure.__closure__  => address 2
```
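The `global` counterpart of the same counter, since the section mentions both keywords:

```python
count = 0  # created in the main body => global scope

def bump():
    global count  # without this, `count += 1` raises UnboundLocalError
    count += 1
    return count

bump()  # 1
bump()  # 2
```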
# Yield
```python
def gen():
    with some_fn() as manager:
        yield something

g = gen()
next(g)  # the manager has not exited yet
next(g)  # execution leaves the with block, and the manager exits
```
```python
def flatten(sequence):
    """flatten a multi-level list or similar
    >>> list(flatten([1, [2], 3]))
    [1, 2, 3]
    >>> list(flatten([1, [2], [3, [4]]]))
    [1, 2, 3, 4]
    """
    for element in sequence:
        if hasattr(element, '__iter__'):
            yield from flatten(element)
        else:
            yield element
```
```python
def writer():
    """A coroutine that writes data *sent* to it to fd, socket, etc."""
    while True:
        try:
            w = (yield)
            print('>> ', w)
        except StopIteration:
            print('Close generator!')

def writer_wrapper(sub_generator):
    # establishes a transparent bidirectional connection
    # between the caller and the sub-generator
    yield from sub_generator

w = writer()
next(w)
w.send('data1')
w.send('data2')
w.throw(StopIteration)

w = writer()
w_wrapper = writer_wrapper(w)
next(w_wrapper)
w_wrapper.send('data1')
w_wrapper.send('data2')
w_wrapper.throw(StopIteration)
```
# Coroutine and Future
Coroutine

There are two types of coroutines: native coroutines and legacy generator-based coroutines. Legacy coroutines were removed in python 3.11.

A native coroutine uses the `async`/`await` syntax. It is executed only when it is `await`ed.

A generator-based coroutine is the old way (python 3.5) to create a coroutine, with the `@asyncio.coroutine`/`yield from` syntax. `@asyncio.coroutine` enables the generator to use `yield from` to call native coroutines, and also enables the generator to be called by native coroutines, for instance using an `await` expression. The decorator `@types.coroutine` does the same thing, with minor differences.
```python
import asyncio

async def b_sleep():
    return 100

@asyncio.coroutine
def a_sleep():
    print("doing something in async")
    yield from b_sleep()  # yield from a native or generator-based coroutine

@asyncio.coroutine
def a_sleep():
    print("doing something in async")
    yield  # yield None

await a_sleep()
```
`asyncio` can create `Future`s, which are scheduled to run by the event loop regardless of `await`.
# Running order
If control is never yielded to the event loop, the running order is the same as the `await` order. If control is yielded to the event loop, scheduled `Future`s can be executed immediately.

To yield control to the event loop:
```python
import asyncio
import types

await asyncio.sleep(1)

# yield None
@types.coroutine
def __sleep0():
    yield

await __sleep0()

# yield a Future
await future
```
Details1 and [Details2](https://github.com/python/cpython/blob/febf54bcf3fdc45ad84b4073e24bbaaee0ac8b2a/Lib/asyncio/tasks.py#L255) on yielding control:
```python
import asyncio
import time

async def p(word):
    print(f'{time.time()} - {word}')

async def main1():
    loop = asyncio.get_event_loop()
    coro = p('await')
    task2 = loop.create_task(p('create_task2'))  # scheduled for the next iteration
    await coro   # coro runs only when awaited
    await task2  # wait for the final result

async def main2():
    loop = asyncio.get_event_loop()
    coro = p('await')
    task2 = loop.create_task(p('create_task2'))  # scheduled for the next iteration
    task3 = loop.create_task(p('create_task3'))  # scheduled for the next iteration
    await asyncio.sleep(1)  # the loop gets control and runs task2 and task3
    await coro   # coro runs only when awaited
    await task2  # wait for the final result
    await task3  # wait for the final result

# await > task2
await main1()
# task2 > task3 > await
await main2()
```
# Awaitable Objects
An awaitable object generally implements an `__await__()` method. Coroutine objects returned from `async def` functions are awaitable. The generator iterator objects returned from generators decorated with `types.coroutine()` or `asyncio.coroutine()` are also awaitable, but they do not implement `__await__()`.
```python
class MyObject:
    def __await__(self):
        # must return an iterator
        yield from a_future  # marks __await__ as an iterator
```
# Garbage collection
When an object is not referenced by any variable or other object, or becomes unreachable, it becomes garbage; Python makes the unused memory available for reuse, but when it gets returned to the OS depends on the allocator.
To check memory allocation:
```python
import gc
import sys

import numpy as np

np.array([1, 2, 3]).nbytes  # check memory allocation for numpy

# check memory allocation for any object
def get_obj_size(obj):
    marked = {id(obj)}
    obj_q = [obj]
    sz = 0
    while obj_q:
        sz += sum(map(sys.getsizeof, obj_q))
        # look up all the objects referred to by the objects in obj_q;
        # see: https://docs.python.org/3.7/library/gc.html#gc.get_referents
        all_refr = ((id(o), o) for o in gc.get_referents(*obj_q))
        # filter objects that are already marked;
        # using dict notation prevents repeated objects
        new_refr = {o_id: o for o_id, o in all_refr
                    if o_id not in marked and not isinstance(o, type)}
        # the new obj_q contains the objects that were not marked;
        # update marked with their ids so we do not traverse them again
        obj_q = new_refr.values()
        marked.update(new_refr.keys())
    return sz
```
To clear memory for a dict:
```python
a = {'a': [1, 2, 3], 'b': [4, 5, 6]}
del a['a']  # cuts the binding to the references; {'b': [4, 5, 6]}
a.pop('a')  # x = a['a']; del a['a']; return x
a.clear()   # empties the dict in place and clears the hash table; {}
```
To check number of references:
```python
import gc
import sys

a = {'a': [1, 2, 3], 'b': [4, 5, 6]}
gc.get_referrers(a)
sys.getrefcount(a)  # 2: the number of references plus 1, since `a` is passed to this function
```
# Subprocess
A child process becomes a zombie if it exits or fails and its parent never reaps it. Be careful to avoid creating zombie processes:
```python
import subprocess
from subprocess import Popen, PIPE, TimeoutExpired

# 1. use call
subprocess.call(['grep', 'jdoe', '/etc/passwd'])  # waits for the command to complete

# 2. use communicate
proc = Popen(['ls', '-l', '/tmp'], stdout=PIPE, stderr=PIPE)
stdout, stderr = proc.communicate()  # reads data from stdout and stderr until end-of-file

try:
    stdout, stderr = proc.communicate(timeout=15)  # if the child is still open after 15s, it is NOT killed
except TimeoutExpired:
    proc.kill()
    outs, errs = proc.communicate()

# 3. use wait
proc = subprocess.Popen(('ls', '-l', '/tmp'), stdout=subprocess.PIPE)
proc.wait()

# 4. if no wait, make sure you don't keep references to the Popen objects.
# If the python script exits, the subprocess gets pid 1 as its parent
Popen(["sleep", "30"])
```
# Concurrent & Parallel
`multiprocessing` is designed for parallelism.

| | Multi-args | Concurrence | Blocking | Ordered-results |
|---|---|---|---|---|
| apply | ✔ | X | ✔ | X |
| apply_async | ✔ | ✔ | X | X |
| map | X | ✔ | ✔ | ✔ |
| map_async | X | ✔ | X | ✔ |
| starmap | ✔ | ✔ | ✔ | ✔ |
| starmap_async | ✔ | ✔ | X | X |

- `apply(func, args, kwds)`: lets func run in a child process but blocks the main process; same as `apply_async().get()`
- `apply_async(func, args, kwds, callback, error_callback)`: schedules func and returns an async result object; the order of multiple tasks is indeterminate
```python
import logging
import multiprocessing as mp
from tqdm import tqdm

log = logging.getLogger(__name__)

# this function is called in the main process
def a_back(progress, result, saver):
    progress.update()
    log.info('sth')
    process_result(result)

# this function is called in the main process
def b_back(progress, exception, saver):
    progress.update()
    log.info('sth')
    process_exception(exception)

# exit: pool.terminate
with mp.Pool(processes=4) as pool:
    with tqdm(total=some_count) as progress:
        jobs = []
        saver = {}
        for i in a_iterator:
            job = pool.apply_async(func, args=(i,), kwds=a_dict,
                                   callback=lambda r: a_back(progress, r, saver),
                                   error_callback=lambda e: b_back(progress, e, saver))
            jobs.append(job)
        # handle exceptions thrown in child processes
        for j in jobs:
            try:
                j.get()  # blocks the main process, but other children keep running in the background
            except Exception as e:
                # the exception is still raised here even though it was already consumed once by error_callback
                handle()
        # alternatively, if not interested in exceptions:
        # pool.close()
        # pool.join()
        process_saver(saver)
```
- `map(func, [1, 2, 3, 4])`: chops the iterable into a number of chunks which it submits to the process pool as separate tasks; blocks the main process; order is guaranteed; `map(func, [(1, 2), (3, 4)])` does not pass multiple arguments
- `map_async(func, [1, 2], callback, error_callback)`: order is guaranteed
`fork` vs `spawn`

- `spawn` is `fork` + `execve`: Ref
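Selecting the start method explicitly; a minimal sketch (`fork` is unavailable on Windows):

```python
import multiprocessing as mp

def work(x):
    return x * x

if __name__ == '__main__':
    # 'spawn' starts a fresh interpreter and re-imports this module;
    # 'fork' copies the parent process's memory via fork(2)
    ctx = mp.get_context('spawn')
    with ctx.Pool(2) as pool:
        print(pool.map(work, [1, 2, 3]))  # [1, 4, 9]
```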
# Time
# Get
```python
from datetime import datetime
import pytz

# current time in the local time zone
datetime.now()
# current time in another time zone
datetime.now(pytz.timezone('America/Chicago'))
```
# Create
```python
# create a time with a time zone, e.g., 2015-06-11
my_datetime = datetime(2015, 6, 11, 13, 30)
my_tz = pytz.timezone('America/Chicago')
good_dt = my_tz.localize(my_datetime)  # replace(tzinfo=my_tz) gives a wrong result: https://stackoverflow.com/a/50613134/6845273
```
# Switch
```python
from dateutil import parser
from dateutil import tz

# switch utc to GMT+08:00
raw_time_str = '2021-11-03T09:53:10.683Z'
from_zone = tz.gettz('UTC')
to_zone = tz.gettz('Asia/Shanghai')  # the IANA name, also used by pytz
utc = parser.parse(raw_time_str)
utc = utc.replace(tzinfo=from_zone)
utc.astimezone(to_zone)
```
# Parser
```python
import dateparser

# parse with a custom format, custom locale, and base year
dateparser.parse(time_string,
                 date_formats=['%Y-%m-%d'],
                 settings={'DATE_ORDER': 'DMY',
                           'RELATIVE_BASE': datetime(2001, 1, 1),
                           'PARSERS': ['custom-formats']},
                 locales=['en'])

from dateutil import parser
parser.parse(time_string)
```
| query | dateutil | dateparser |
|---|---|---|
| 2016/09/01 | 1 | 1 |
| 2016-09-01 | 1 | 1 |
| 09/01/2016 | 1 | 1 |
| 09-01-2016 | 1 | 1 |
| 09012016 | 0 | 0 |
| 09/01/2016 15:20 | 1 | 1 |
| 09/01/2016 at 15h20 | 1 | 1 |
| 15 min ago | 0 | 1 |
| two hours ago | 0 | 1 |
| a day ago | 0 | 1 |
| tuesday | 0 | 1 |
| monday at noon | 0 | 1 |
| total (12) | 6 | 11 |
# Operation
```python
from datetime import datetime, timedelta
import pytz

# subtract
d = datetime.today() - timedelta(days=days_to_subtract)

# create from a timestamp
date = datetime.fromtimestamp(ten_digit_ts, pytz.timezone('Asia/Shanghai'))
```
# IDE
# Pycharm
dependency

- adding a local folder as a library root [interpreter settings] => added to the interpreter paths => added to `sys.path`
- python setup.py => adds a soft link in site-packages => `sys.path` contains site-packages
- adding a folder as a source root => added to PYTHONPATH => added to `sys.path`
- if `PATH` is edited in the run config, its value automatically becomes `new_value:$PATH`

profiler

- own time: time spent without children

locale

- `LC_CTYPE` is empty by default in the runner, and `C` in the pycharm terminal
# Conda
- channels can be modified at ~/.condarc
- some activation magic lives at $CONDA_PREFIX/etc/conda/activate.d

useful commands:

```bash
# refresh a package
conda install --force-reinstall package
# view package dependencies
# or at https://libraries.io/
conda create --dry-run --json -n dummy package
# search multiple channels for packages
conda install -c channel1 -c channel2 package
# view the current env path
echo $CONDA_PREFIX
# remove an env
conda env remove -n env_name
conda env remove -p env_path
```
# pip
useful commands
```bash
# reinstall a package
pip install --force-reinstall package==version
# generate requirements.txt
pip freeze > requirements.txt
# ignore the installed version
pip install -I
# search pypi first, then the extra url;
# pip 20.x chooses the latest possible version from the combined set;
# pip 21.x performs an additional compatibility check
pip install --extra-index-url https://123.com
# view installation logs
TMPDIR=./tmp pip install --no-clean
# fresh install without using any cache (e.g., built/downloaded wheels)
pip install --no-cache-dir
# fresh install without using any binary => build the binary locally
pip install --no-binary :all:
```

When making build requirements available, pip does so in an isolated environment. That is, pip does not install those requirements into the user’s site-packages, but rather installs them in a temporary directory which it adds to the user’s sys.path for the duration of the build. This ensures that build requirements are handled independently of the user’s runtime environment. For example, a project that needs a recent version of setuptools to build can still be installed, even if the user has an older version installed (and without silently replacing that version).

In certain cases, projects (or redistributors) may have workflows that explicitly manage the build environment. For such workflows, build isolation can be problematic. If this is the case, pip provides a --no-build-isolation flag to disable build isolation. Users supplying this flag are responsible for ensuring the build environment is managed appropriately (including ensuring that all required build dependencies are installed).

Build requirements are specified in pyproject.toml (PEP 518), e.g.,

```toml
[build-system]
requires = [
    "setuptools",
    "cython>=0.25,<3.0",
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
    "thinc>=8.0.12,<8.1.0",
    "blis>=0.4.0,<0.8.0",
    "pathy",
    "numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"
```

```bash
pip install --no-build-isolation
pipdeptree -fl             # examine the dependency tree
pip install --dry-run pkg  # resolve without installing
```
# yaml
`ruamel.yaml` is a derivative of `PyYAML`, which is not very actively developed. `PyYAML` supports the YAML 1.1 standard; `ruamel.yaml` supports YAML 1.2, as released in 2009.

More: PyYAML YAML 1.2 status: https://github.com/yaml/pyyaml/issues/116
# Disable alias
```python
import sys
import ruamel.yaml

class NonAliasingRTRepresenter(ruamel.yaml.representer.RoundTripRepresenter):
    def ignore_aliases(self, data):
        return True

yaml = ruamel.yaml.YAML()
yaml.Representer = NonAliasingRTRepresenter
yaml.dump(data, sys.stdout)
```
# Partial flow
YAML calls the indentation style “block” and the JSON style “flow”. Flow style can be used at any point within the block style.
```python
import sys
import ruamel.yaml

def L(*l):
    ret = ruamel.yaml.comments.CommentedSeq(l)
    ret.fa.set_flow_style()
    return ret

yaml = ruamel.yaml.YAML()
data = {}
data['users'] = L()
data['users'].append('user2 key1')
data['users'].append('user2 key2')
data['users2'] = L('abc', L('user2 group1', 'user2 group2'))
yaml.dump(data, sys.stdout)
"""
users: [user2 key1, user2 key2]
users2: [abc, [user2 group1, user2 group2]]
"""
```
# Multiline
http://yaml-multiline.info/
# Miscellaneous
strip
"[1,2,3,4]".strip("[1,2,]") # "3,4", any combination
find
"123".find('1') # 0
raw string
r"\x" # "\\x" r'\' # SyntaxError r"\\" # "\\\\" "\s" # "\\s" r"\s" # "\\s" "\f" # \x0c r"\f" # \\f
pathlib
```python
import json
import os
import pathlib
from pathlib import Path

pathlib.Path.cwd()                # current working directory
pathlib.Path(__file__).resolve()  # absolute path of the current file
p = Path("/home/user/Downloads/repo/test.txt")
p.stem  # 'test', name without extension
p.name  # 'test.txt', full name
json.loads(pathlib.Path('123.json').read_text())
os.getcwd()
os.path.dirname(os.path.realpath(__file__))  # directory
os.path.split(os.path.realpath(__file__))    # directory and filename
pathlib.Path('123/456/789/12.txt').parents[0] == pathlib.Path('123/456/789/12.txt').parent
pathlib.Path('123/456/789/12.txt').parents[1] == pathlib.Path('123/456/789/12.txt').parent.parent
```
The return value of `Path` is a special class; it can be used with `/`:

```python
pathlib.Path.cwd() / "dir"
```
shutil
```python
import os
import shutil
import sys
from pathlib import Path

shutil.copyfileobj(f, sys.stdout)  # similar to cat
shutil.move(f, target)   # target must not exist; target can be on a different disk
os.rename(f, target)     # target must not exist; target on the same disk
os.replace(f, target)    # target can exist; target on the same disk
Path(f).rename(target)   # Unix: target can exist; target can be on a different disk; similar to the mv command
Path(f).replace(target)  # target can exist on any os; target can be on a different disk
```
dict
{"a":10}.get(keyname, valueifkeynotexist) {"a": 10}.pop("b") # error {"a": 10}.pop("b", None) # ok
encoding

- utf-8 excel files contain a dummy prefix (BOM), so use `utf_8_sig`
- `open()`'s default encoding is decided by the environment's `locale`; it can be checked with `python -c 'import locale; print(locale.getpreferredencoding())'` and set with `export LC_ALL=en_US.UTF-8; export LANG=en_US.UTF-8`
- in python 3.6, `invalid_country.UTF-8` (a locale not present in `locale -a`) leads `preferredencoding` to be ascii; in python 3.7 this problem no longer exists
- `sys.stdout.encoding` is decided by the environment's `locale` and can be set with `PYTHONIOENCODING=UTF-8`
- `b'123'` -> `'123'`: `b'123'.decode('utf-8')`
string decomposition

Taking ⑴, ①, è and 프 into consideration:

- `unicodedata.normalize('NFD', text)`, canonical equivalence: è or 프
- `unicodedata.normalize('NFKD', text)`, compatibility equivalence: ⑴, ①, è and 프
- `unicodedata.decomposition(text)`, special format: ⑴, ①, è
- `unicodedata.category(char)` can be used to check if a character is an unwanted accent mark

doc

- `dir(object)`: check all attributes
- `object.attr.__doc__`: check how to use
- `?object.attr` in jupyter to check how to use
- `inspect.getsource(object.method)`: check the source code
yield
return a generator to save space and some code
```python
def countdown_gen(x):
    count = x
    while count > 0:
        yield count
        count -= 1

g = countdown_gen(5)
for item in g:
    print(item)  # 5, 4, 3, 2, 1
```
assign a value back to yield
```python
def getPrimes(number):
    while True:
        if isPrime(number):
            # the generator returns number, and assigns the sent value to number
            number = yield number
        number += 1

gen = getPrimes(10)
gen.send(None)  # the first send must be None; next(gen) <=> gen.send(None)
# 11
# execution is stuck at yield
gen.send(10)  # number becomes 10 and another round of iteration starts
# 11
# execution is stuck at yield
```
```python
def translator():
    # load all the words in the English language and their translations to 'other lang'
    my_words_dict = {'hello': 'hello in other language', 'dog': 'dog in other language'}
    while True:
        word = yield
        yield my_words_dict.get(word, 'Unknown word...')

my_words_translator = translator()
next(my_words_translator)               # stuck at the first yield
print(my_words_translator.send('dog'))  # assigns the value to word; stuck at the second yield
next(my_words_translator)               # stuck at the first yield again
print(my_words_translator.send('cat'))
```
throw exception
```python
def add_to_database(connection_string):
    db = mydatabaselibrary.connect(connection_string)
    cursor = db.cursor()
    try:
        while True:
            try:
                row = yield
                cursor.execute('INSERT INTO mytable VALUES(?, ?, ?)', row)
            except CommitException:
                cursor.execute('COMMIT')
            except AbortException:
                cursor.execute('ABORT')
    finally:
        cursor.execute('ABORT')
        db.close()

db = add_to_database('bla')
db.send(None)              # opens a database connection
db.send('bla')             # inserts a row
db.throw(CommitException)  # commits the transaction
```
yield from: transparent two way channel between the caller and the sub-generator, more info, but one simple usage is generating values from a iterator
```python
def flatten(sequence):
    """flatten a multi-level list or similar
    >>> list(flatten([1, [2], 3]))
    [1, 2, 3]
    >>> list(flatten([1, [2], [3, [4]]]))
    [1, 2, 3, 4]
    """
    for element in sequence:
        if hasattr(element, '__iter__'):
            yield from flatten(element)
        else:
            yield element
```
naming variables and packages: `identifier ::= (letter|"_") (letter | digit | "_")*`; breaking the rule is an interpreter error

Invalid Import

```python
from importlib import import_module
import_module('module-name')  # or:
__import__('module-name')
```
declaration hoisting
There is no hoisting in python
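What the absence of hoisting means in practice:

```python
print(y)  # NameError: a name does not exist before its binding line executes
y = 2

def f():
    print(x)  # UnboundLocalError: the assignment below makes x local to f,
    x = 1     # but there is no hoisted placeholder value as in JS
```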
lambda
```python
lambda *x: print(x)  # supports varargs
```
json
- `json.dump(ensure_ascii=True)`: saves 我 as \u6211, which can be read in any encoding as \u6211
- `json.loads()`: turns \u6211 back into 我

```python
import json

json.dump({"text": "我"}, open('123.json', 'w'), ensure_ascii=True)
with open('123.json', encoding='latin_1') as reader:  # any encoding works
    read_text = reader.read()
    json_text = json.loads(read_text)
    print(read_text)  # \u6211
    print(json_text)  # 我

json.dump({"text": "我"}, open('123.json', 'w'), ensure_ascii=False)
with open('123.json', encoding='latin_1') as reader:  # wrong encoding mangles the text
    read_text = reader.read()
    json_text = json.loads(read_text)
    print(read_text)  # {"text": "æ"}
    print(json_text)  # {'text': 'æ\x88\x91'}
```
Take care with control characters in `json.loads()`:

```python
import json

json.loads('{"apple": "good\nfruit"}', strict=False)  # a raw newline needs strict=False
json.loads(r'{"apple": "good\nfruit"}')               # an escaped \n is fine

# more examples
json.loads('["\\u00b9"]')
json.loads('["¹"]')
```
split
If sep is not specified or is None, a different splitting algorithm is applied: runs of consecutive whitespace are regarded as a single separator, and the result will contain no empty strings at the start or end if the string has leading or trailing whitespace. Consequently, splitting an empty string or a string consisting of just whitespace with a None separator returns [].
```python
' a b c '.split()     # ['a', 'b', 'c']
' a b c '.split(' ')  # ['', 'a', 'b', 'c', '']
```
variable scope
```python
def kk(value):
    def gg():
        print(value)
    gg()
    value = 10
    gg()

kk(3)
# 3
# 10
```