# Basics
Quick reference for syntax and usage. The focus is on system design, the fundamental design of the language itself, and algorithms.
# Loop
range
```python
range(10)            # 0 to 9
reversed(range(10))  # 9 to 0
for i in range(10):
    pass
i == 9  # True, the loop variable survives after the loop
```
zip
Comparison between Python 2 and 3:

python3
: zip returns an iterator.

python2
: zip returns a list.

```python
import itertools

zip([1, 2, 3, 4])        # iterator of [(1,), (2,), (3,), (4,)]
list(zip([1, 2, 3, 4]))  # [(1,), (2,), (3,), (4,)]
zip([1, 2, 3], [4, 5, 6])              # [(1, 4), (2, 5), (3, 6)]
zip([1, 2, 3], [1, 2, 3, 4, 5, 6, 7])  # [(1, 1), (2, 2), (3, 3)], stops at the shortest
itertools.zip_longest([1, 2], [4, 5, 6], fillvalue=None)  # [(1, 4), (2, 5), (None, 6)]
```
# String
format
%
```python
# number format
"%.2f%%" % (100 * 1 / 3)  # '33.33%', %% to escape %
# a positive width means right alignment for both strings and numbers
"%6.2f" % (33.33)   # ' 33.33', right aligned
"%-6.2f" % (33.33)  # '33.33 ', left aligned
"%x" % (8217)       # '2019', decimal to hex
("%%%ds" % length) % string  # width as parameter
"%*s" % (length, string)     # width as parameter
# dict format
to_print = {'name': 'david', 'age': 16}
"%(name)s, %(age)d" % to_print
```
format
"{0}".format(0.33) # "0.33" "{0:.0%}".format(0.33) # "33%" "{0:.2f}".format(0.33) # "0.33" "{0:6.2f}".format(0.33) # " 0.33" "{0:>6.2f}".format(0.33) # " 0.33" "{0:<6.2f}".format(0.33) # "0.33 " "{:6.2f}".format(0.33) # " 0.33" "{:,.2f}".format(1234) # "1,234.00" # string default left, numeric default right "{:8s}".format("guagua") # "guagua " "{:>8s}".format("guagua") # " guagua" '{1:>{0}}'.format(length, string) # indention as parameter "{value}".format(value=10) person = {"name": "guagua", "age": 10} "{name}, {age}".format(**person) "I " + "love " + "you"
f-string

TIP

An f-string is evaluated at runtime, which also makes it faster than `%` formatting and `str.format`. A backslash (`\`) is not allowed inside the `{}` expression part.

```python
name = "guagua"
age = 10
f"{name},{age:.2f}"   # guagua,10.00
# a bare width does not justify by itself
f"{name},{age:6f}"    # guagua,10.000000
# a positive width right-justifies numbers
f"{name},{age:6.2f}"  # guagua, 10.00
# > means right justified (default for numbers)
f"{name},{age:>6.2f}" # guagua, 10.00
# < means left justified
f"{name},{age:<6.2f}" # guagua,10.00
# < is the default for strings
f"start:{name:>10s},{age:6.2f}"  # start:    guagua, 10.00
# - does not work for left justification
f"{name},{age:-6d}"   # guagua,    10
f"{age:{length}}"     # pass the width as a variable
f"{name},{1234:,.2f}" # guagua,1,234.00
f"{func(arg)}"        # arbitrary expressions are evaluated
f"{{{age}}}"          # {10}
f"{ {age} }"          # {10}

class Test:
    def __init__(self):
        self.a = "111"

    def __str__(self):
        return f"{self.a}"

    def __repr__(self):
        return f"{self.a} hahah"

test = Test()
f"{test}"    # 111
f"{test!r}"  # 111 hahah

(f"I " f"love " f"you")  # I love you, adjacent literals concatenate, no + sign needed
f"I " \
f"love " \
f"you"
```
arabic
The bidirectional class of a character defines its display direction.

TIP

The default bidi base direction in the Python and JS engines is left-to-right. In PyCharm it is content-based. VS Code's terminal is problematic: an Arabic string is treated as a normal string.

left-to-right base direction:

```python
"""
1. Arabic words display from right to left
2. alphabetic characters display from left to right
3. the base direction controls how words are connected
4. in a left-to-right base direction, alphabetic characters are put at the existing
   text's right, even if the text is Arabic
5. an Arabic word [or number] is put at the right if the existing text is non-Arabic,
   otherwise at the left
6. \u200e, the left-to-right mark, changes an Arabic [number] to be put at the right
7. \u200f, the right-to-left mark, changes an Arabic [number] to be put at the left
8. \u202e + text + \u202c => txet
9. \u202e + arabic + \u202c => arabic
"""
ar = 'للصناعة'
# existing is none, 123 as it is, 123|cursor-->
# existing is 123 with ltr, the Arabic word is put at the right
# ar[0] = 'ل'
# 123|cursor-->'للصناعة'
'123' + ar  # => '123للصناعة'
'abc' + ar  # => 'abcللصناعة'
# ar[0] = 'ل'
# ar: <--cursor|'للصناعة'
# existing is Arabic, numbers [Arabic] are put at the left
# alphabetic characters [punctuation] are always put at the right
# numbers: cursor-->|'للصناعة'
# alphabets: 'للصناعة'|cursor-->
ar + '123'  # => 'للصناعة123'
ar + 'abc'  # => 'للصناعةabc'
ar + '\n'   # => 'للصناعة\n'
'123' + ar + '456'  # => '123للصناعة456'
# \u200e changes numbers to be put at the right
# numbers: 'للصناعة'|cursor-->
'123' + ar + '\u200E' + '456'  # => '123للصناعة456'
```
Encoding
```python
# 'x': decimal to hex
'%04x' % ord('我')        # '6211'
chr(25105)                # '我'
'我'.encode('utf-8')      # b'\xe6\x88\x91'
# \xef\xbb\xbf is the byte order mark indicating the utf-8 encoding scheme
'我'.encode('utf-8-sig')  # b'\xef\xbb\xbf\xe6\x88\x91'
"""
\xfe\xff or \xff\xfe is a byte order mark indicating endianness and encoding scheme.
If python reads \xff\xfe, it knows utf-16 is using little endian;
if it reads \xfe\xff, it knows utf-16 is using big endian.
In this case utf-16 uses little endian:
\u6211 in little endian is \x11\x62; byte \x62 is 'b' in ascii.
"""
'我'.encode('utf-16')    # b'\xff\xfe\x11b'
'我'.encode('utf-16le')  # b'\x11b'
'我'.encode('utf-16be')  # b'b\x11'
```
Raw string
raw strings are not 100% raw
```python
r'\'         # error, a raw string cannot end with a single backslash
r'\n'        # \n
r'\"'        # \"
r'''123''''  # error
r'''12'3'''  # 12'3
```
- Common method

capitalize the first letter of each word: `st.title()`
# Syntax sugar
if else
```python
x = 10 if a > b else 9  # ternary operator (conditional expression)
# nested conditional assignment
sign = "positive" if num > 0 else "negative" if num < 0 else "zero"
lambda x: True if x % 2 == 0 else False  # or simply: lambda x: x % 2 == 0
[x for x in y if x > 0]
[x if x > 10 else 5 for x in y]
```
tuple
```python
l = 10,  # the trailing comma makes a tuple: (10,)
```
raw string
```python
l = r'\'   # error, a single trailing backslash cannot be rawed
l = '\\'   # works
l = r'\s'  # \s
l = '\\s'  # \s
```
`&` and `|`

```python
import collections

{1, 2, 3} & {3}  # {3}
{1, 2, 3} | {4}  # {1, 2, 3, 4}
collections.Counter([1, 2, 3, 3]) & collections.Counter([3, 2, 2])  # Counter({2: 1, 3: 1})
collections.Counter([1, 2, 3, 3]) | collections.Counter([3, 2, 2])  # Counter({1: 1, 2: 2, 3: 2})
```
sort by two keys
```python
sorted(a_list, key=lambda x: (x.key1, x.key2))
```
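When the two keys must sort in opposite directions, negating a numeric key is a common trick; a minimal sketch with made-up tuples:

```python
records = [("b", 2), ("a", 2), ("a", 1)]
# ascending by name, then descending by the number
sorted(records, key=lambda r: (r[0], -r[1]))  # [('a', 2), ('a', 1), ('b', 2)]
```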
list concatenation
```python
l = []
l[:0] = [1, 2, 3]  # [1, 2, 3], slice assignment at the front
```
one liner
```
stmt: simple_stmt | compound_stmt
simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
```
`;` separates non-compound statements on a single line; the grammar makes no allowance for a non-compound statement and a compound statement separated by a semicolon. Example with a compound `for` statement:

```bash
python -c $'x=10\nfor i in range(x): print(i)'  # use \n to separate a non-compound and a compound statement
```
in
```python
def __contains__(self, needle):
    for elem in self:
        if needle is elem or needle == elem:
            return True
    return False
```
# Error handling
TIP
If the code in the except block raises an unhandled exception, the previously caught exception is reported again, in the format:

```
original_exception

During handling of the above exception, another exception occurred:

new_exception
```
```python
try:
    code()
except Exception as e:
    code_throw_exception()
```
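Chaining can also be controlled explicitly; a minimal sketch of `raise ... from ...`:

```python
try:
    int("x")
except ValueError as e:
    # reported as "The above exception was the direct cause of the following exception:"
    raise RuntimeError("bad input") from e
    # raise RuntimeError("bad input") from None  # suppresses the chained traceback
```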
# Regex
re.sub
re.sub(r"[^a-b]", ' ', "mystring") re.sub('\n', '', '123\n456') # 123456 re.sub(r'\n', '', '123\n456') # 123456, \n raw string is treated as new line in regex re.sub('\\n', '', '123\n456') # 123456, \n raw string is treated as new line in regex # returns '-a-b--d-' # Empty matches for the pattern are replaced when adjacent to a previous non-empty match # five matches due to * |a|bx|d| re.sub('x*', '-', 'abxd') re.sub('.*', '123', 'abc') # abc| => 123123
re.findall
re.findall("dss", "gddss") # ["dss"], a list re.findall("dss", "gdssdss") # ["dss", "dss"], a list re.findall("d(ss)", "gddss") # ["ss"], a list re.findall("(12)|(34)", "123445") # [('12', '34')] re.findall("(12)|(34)", "123445") # [('12', ''), ('', '34')] re.search("d(ss)", "gddss") # a match group, where 0 is the full match(dss), group 1 is ss. re.search("dss", "gdssdss") # only has group 0(dss) re.match("dss", "gddss") # None, re.match("dss") <=> re.search("^dss") avoiding using re.match
re.MULTILINE

With `re.MULTILINE`, `^` in a pattern matches at the start of every line. The flag is invalid for `re.match`, which only matches at the beginning of the string.
lookaround

Order

- behind -> ahead
- `(?<=abc)123`: first find 123, then look behind for abc

Trivial patterns

- `something(?<=something)` and `(?=something)something` are trivial: `123(?<=123)abc`, `123(?=abc)abc`, `(123abc)456(?<=123)abc` (no matched string exists)
- `re.search("^(?<=abc)123")` or `re.match("(?<=abc)123")` always matches nothing.
- If a lookahead has something ahead of it, that must be a substring of the lookahead (or `.`). If a lookbehind has something behind it, that must be a substring of the lookbehind.

Flexibility

- `(?=something)` and `(?!something)` are flexible: `something` can be any regex pattern
- lookahead supports the or operation, but lookbehind doesn't (fixed-width jumping back to save speed)
- `^((?!123).)*$`: from start to end, as long as there is no 123 inside (negative lookahead has a flexible body)
- `^((?!123|456).)*$`: from start to end, as long as there is neither 123 nor 456 inside
- `(?:(?<=123)|(?<=456))`: preceded by 123 or 456
- `(?<!123)(?<!456)`: preceded by neither 123 nor 456
\b

`\b` matches only if there is a valid "word" character on one side and a non-word character on the other:

- `)\b` does not match in `) word`, since both sides of `)` are non-word characters
- `a\b` matches in `a word`
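A quick check of the two cases with `re`:

```python
import re

re.search(r'\)\b', ') word')  # None: both neighbors of the candidate boundary are non-word
re.search(r'a\b', 'a word')   # <re.Match ...>: 'a' is a word character, ' ' is not
```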
fuzzy match (from `regex`, not `re`)

Regex usually attempts an exact match, but sometimes an approximate, or “fuzzy”, match is needed, for those cases where the text being searched may contain errors in the form of inserted, deleted or substituted characters.
A fuzzy regex specifies which types of errors are permitted, and, optionally, either the minimum and maximum or only the maximum permitted number of each type. (You cannot specify only a minimum.)
The 3 types of error are:
- Insertion, indicated by “i”
- Deletion, indicated by “d”
- Substitution, indicated by “s”
In addition, “e” indicates any type of error.
The fuzziness of a regex item is specified between “{” and “}” after the item.
Examples:
- `foo` match “foo” exactly
- `(?:foo){i}` match “foo”, permitting insertions
- `(?:foo){d}` match “foo”, permitting deletions
- `(?:foo){s}` match “foo”, permitting substitutions
- `(?:foo){i,s}` match “foo”, permitting insertions and substitutions
- `(?:foo){e}` match “foo”, permitting errors
If a certain type of error is specified, then any type not specified will not be permitted.
In the following examples I’ll omit the item and write only the fuzziness:
- `{d<=3}` permit at most 3 deletions, but no other types
- `{i<=1,s<=2}` permit at most 1 insertion and at most 2 substitutions, but no deletions
- `{1<=e<=3}` permit at least 1 and at most 3 errors
- `{i<=2,d<=2,e<=3}` permit at most 2 insertions, at most 2 deletions, at most 3 errors in total, but no substitutions
It’s also possible to state the costs of each type of error and the maximum permitted total cost.
Examples:
- `{2i+2d+1s<=4}` each insertion costs 2, each deletion costs 2, each substitution costs 1, the total cost must not exceed 4
- `{i<=1,d<=1,s<=1,2i+2d+1s<=4}` at most 1 insertion, at most 1 deletion, at most 1 substitution; each insertion costs 2, each deletion costs 2, each substitution costs 1, the total cost must not exceed 4
You can also use “<” instead of “<=” if you want an exclusive minimum or maximum.
You can add a test to perform on a character that’s substituted or inserted.
Examples:
- `{s<=2:[a-z]}` at most 2 substitutions, which must be in the character set `[a-z]`
- `{s<=2,i<=3:\d}` at most 2 substitutions, at most 3 insertions, which must be digits
By default, fuzzy matching searches for the first match that meets the given constraints. The `ENHANCEMATCH` flag will cause it to attempt to improve the fit (i.e. reduce the number of errors) of the match that it has found. The `BESTMATCH` flag will make it search for the best match instead.

Further examples to note:

- `regex.search("(dog){e}", "cat and dog")[1]` returns `"cat"` because that matches `"dog"` with 3 errors (an unlimited number of errors is permitted).
- `regex.search("(dog){e<=1}", "cat and dog")[1]` returns `" dog"` (with a leading space) because that matches `"dog"` with 1 error, which is within the limit.
- `regex.search("(?e)(dog){e<=1}", "cat and dog")[1]` returns `"dog"` (without a leading space) because the fuzzy search matches `" dog"` with 1 error, which is within the limit, and the `(?e)` then attempts a better fit.
In the first two examples there are perfect matches later in the string, but in neither case is it the first possible match.
The match object has an attribute `fuzzy_counts` which gives the total number of substitutions, insertions and deletions.
```python
>>> # A 'raw' fuzzy match:
>>> regex.fullmatch(r"(?:cats|cat){e<=1}", "cat").fuzzy_counts
(0, 0, 1)
>>> # 0 substitutions, 0 insertions, 1 deletion.

>>> # A better match might be possible if the ENHANCEMATCH flag is used:
>>> regex.fullmatch(r"(?e)(?:cats|cat){e<=1}", "cat").fuzzy_counts
(0, 0, 0)
>>> # 0 substitutions, 0 insertions, 0 deletions.
```
The match object also has an attribute `fuzzy_changes` which gives a tuple of the positions of the substitutions, insertions and deletions.

```python
>>> m = regex.search('(fuu){i<=2,d<=2,e<=5}', 'anaconda foo bar')
>>> m
<regex.Match object; span=(7, 10), match='a f', fuzzy_counts=(0, 2, 2)>
>>> m.fuzzy_changes
([], [7, 8], [10, 11])
```
What this means is that if the matched part of the string had been:
'anacondfuuoo bar'
it would’ve been an exact match.
However, there were insertions at positions 7 and 8:
```
'anaconda fuuoo bar'
        ^^
```
and deletions at positions 10 and 11:
```
'anaconda f~~oo bar'
           ^^
```
So the actual string was:
'anaconda foo bar'
```python
import re

# catastrophic backtracking: the 1st \W* and the 2nd \W* create a huge number of combinations
re.search(r'^(\W*K\W*)+$', 'K K K K K K K K 6')
# fine: \W*K leaves little room for combinations
re.search(r'^(\W*K)+\W*$', 'K K K K K K K K 6')
```
# Arithmetic
- `**`: power
- `val // num` => floor
- `-(-val // num)` => ceil
- floating point is inexact: `449027587.27 - 315000000 != 134027587.27`

```python
from decimal import Decimal

# construct Decimals from strings; Decimal(449027587.27) would carry over the float's error
Decimal('449027587.27') - Decimal('315000000') == Decimal('134027587.27')  # True
```
Operator precedence, from high to low:

`**` > `~x` > `+x`, `-x` > `*`, `/`, `//`, `%` > `+`, `-` > `<<`, `>>` > `&` > `^` > `|` > `!=`, `==`
# Number bases
- hex <=> decimal
```python
hex(12)          # '0xc'
format(12, 'x')  # 'c' (without the '0x' prefix)
int("0xc", 16)   # 12
```
- bin <=> decimal
```python
bin(12)           # '0b1100'
format(12, 'b')   # '1100'
int("0b1100", 2)  # 12
```
# List
# Basics
```python
l1 = [1, 2, 3]
l1.extend([4, 5, 6])
l1.append(7)
[8] + l1
l1[0::2]
l1[::-1]

a = [1, 2, 3]
b = [4, 5, 6]
c = [*a, *b]  # [1, 2, 3, 4, 5, 6]
mixed_types = [1, 2, 3, 'abc']
```
# pydash
- `pydash.key_by` and `pydash.group_by` keep references to the original items
- the `iteratee` of `pydash.key_by` and `pydash.group_by` is evaluated as a property path if it is `.`-delimited, or a list with more than one item
- `pydash.get` accepts a path in both `.` and list format
# Logging
levels
- CRITICAL
- ERROR
- WARNING
- INFO
- DEBUG
- NOTSET
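Each name maps to a number (CRITICAL 50 down to NOTSET 0), and a logger only emits records at or above its effective level; a quick check:

```python
import logging

(logging.CRITICAL, logging.ERROR, logging.WARNING)  # (50, 40, 30)
(logging.INFO, logging.DEBUG, logging.NOTSET)       # (20, 10, 0)

log = logging.getLogger("demo")
log.setLevel(logging.WARNING)
log.isEnabledFor(logging.INFO)   # False
log.isEnabledFor(logging.ERROR)  # True
```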
mechanism
```python
import logging

log = logging.getLogger(__name__)
log.info("Hello, world")
```

The message is turned into a `LogRecord` object and routed to a `Handler` object registered for this logger. The handler then uses a `Formatter` to turn the LogRecord into a string and emit that string.

lazy evaluation
```python
logger.debug('this is a debug message %s', var)  # lazy evaluation (formatted only if the debug level is enabled)
# always evaluated
logger.debug(f'this is a debug message {var}')
logger.debug('this is a debug message %s' % var)
```
usage
`basicConfig`: initializes logging and adds a `Formatter` and a `Handler` to the root logger if none exist (`logging.info` calls `basicConfig` internally if no handler is available).

```python
import logging
import os

# exports to stderr
# ERROR:the.module.name:The log message
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
```
OneLineExceptionFormatter
```python
import logging
import os

class OneLineExceptionFormatter(logging.Formatter):
    def formatException(self, exc_info):
        result = super().formatException(exc_info)
        return repr(result)

    def format(self, record):
        result = super().format(record)
        if record.exc_text:
            result = result.replace("\n", "")
        return result

# the default level for a handler is NOTSET
handler = logging.StreamHandler()
# BASIC_FORMAT = "%(levelname)s:%(name)s:%(message)s"
formatter = OneLineExceptionFormatter(logging.BASIC_FORMAT)
handler.setFormatter(formatter)
# the default level for a logger is NOTSET
# no name given, returns the root logger
root = logging.getLogger()
root.setLevel(os.environ.get("LOGLEVEL", "INFO"))
root.addHandler(handler)

# global try/catch
try:
    exit(main())
except Exception:
    # logging.exception(msg) == logging.error(msg, exc_info=True)
    # exc_info contains the stack trace
    # if there is no other exception handler, outputting the stack trace
    # is okay; otherwise, log the error with a summary only, i.e.,
    # logging.exception(msg, exc_info=False)
    logging.exception("Exception in main(): ")
    exit(1)
```
ColoredLogger
```python
import logging
import os

class CustomFormatter(logging.Formatter):
    """Logging Formatter to add colors and count warnings / errors"""

    grey = "\x1b[38;21m"
    yellow = "\x1b[33;21m"
    red = "\x1b[31;21m"
    bold_red = "\x1b[31;1m"
    reset = "\x1b[0m"
    fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)"

    FORMATS = {
        logging.DEBUG: grey + fmt + reset,
        logging.INFO: grey + fmt + reset,
        logging.WARNING: yellow + fmt + reset,
        logging.ERROR: red + fmt + reset,
        logging.CRITICAL: bold_red + fmt + reset,
    }

    def format(self, record):
        log_fmt = self.FORMATS.get(record.levelno)
        formatter = logging.Formatter(log_fmt)
        return formatter.format(record)

# the default level for a handler is NOTSET
handler = logging.StreamHandler()
handler.setFormatter(CustomFormatter())
# the default level for a logger is NOTSET
# no name given, returns the root logger
root = logging.getLogger()
root.setLevel(os.environ.get("LOGLEVEL", "INFO"))
root.addHandler(handler)

# global try/catch
try:
    exit(main())
except Exception:
    # logging.exception(msg) == logging.error(msg, exc_info=True)
    # exc_info contains the stack trace
    logging.exception("Exception in main(): ")
    exit(1)
```
Log to file
```python
import logging
import logging.handlers
import os

handler = logging.handlers.WatchedFileHandler(
    os.environ.get("LOGFILE", "/var/log/yourapp.log"))
formatter = logging.Formatter(logging.BASIC_FORMAT)
handler.setFormatter(formatter)
root = logging.getLogger()
root.setLevel(os.environ.get("LOGLEVEL", "INFO"))
root.addHandler(handler)

# global try/catch
try:
    exit(main())
except Exception:
    logging.exception("Exception in main()")
    exit(1)
```
load the logging configuration from a configuration file
```yaml
version: 1
disable_existing_loggers: true
formatters:
  simple:
    format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
handlers:
  console:
    class: logging.StreamHandler
    level: INFO
    formatter: simple
    stream: ext://sys.stdout
  file:
    class: logging.FileHandler
    level: DEBUG
    filename: logs/dbInteract.log
loggers:
  # they override the existing ones;
  # with disable_existing_loggers, the non-defined loggers won't work
  simpleExample:
    level: DEBUG
    handlers: [console]
    propagate: no
root:
  level: DEBUG  # the logger level should be above or equal to the handler's level
  handlers: [console, file]
```

```python
import logging.config
import yaml

with open('./test.yml', 'r') as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader)
logging.config.dictConfig(config)
```
remarks
- When deploying to containers, keep things as simple as possible: log to standard out/err and rely on your container host or orchestration platform to figure out what to do with the logs
- when exporting to stderr, pycharm marks the messages in red
- `gunicorn` adds additional handlers, whereas `tensorflow` will not if a root logger already exists
- set up the logging system after the necessary packages are imported, to take control of them
- to disable a specific package: `logging.getLogger('name').disabled = True`
- `logging.info` uses the root logger; `logging.getLogger()` gets the root logger
# import
Background
Module execution via import statement (i.e., `import <modulename>`):

- `sys.path` is not modified in any way
- `__name__` is set to the absolute form of `<modulename>`
- `__package__` is set to the immediate parent package in `<modulename>`
- `__init__.py` is evaluated for all packages (including its own for package modules)
- `__main__.py` is not evaluated for package modules; the code is evaluated for code modules
Module execution via command line with filename (i.e., `python <filename>`):

- `sys.path` is modified to include the final directory in `<filename>`
- `__name__` is set to `__main__`
- `__package__` is set to `None`
- `__init__.py` is not evaluated for any package (including its own for package modules)
- `__main__.py` is evaluated for package modules; the code is evaluated for code modules
Module execution via command line with modulename (i.e., `python -m <modulename>`):

- `sys.path` is modified to include the current directory
- `__name__` is set to `__main__`
- `__package__` is set to the immediate parent package in `<modulename>`
- `__init__.py` is evaluated for all packages (including its own for package modules)
- `__main__.py` is evaluated for package modules; the code is evaluated for code modules
import steps
""" project structure /usr/test(root): - app.py # __package__ is '' - another_app.py # __package__ is '' - moduleA(pakcage): - class_a.py # __package__ is moduleA - class_b.py # __package__ is moduleA """
- resolve relative imports: if there is `from .xx import yy` or `from ..xx import yy` etc., `__package__` is used to resolve them. `__package__` might be None, the empty string, or the package name. Different modules (files) can have different `__package__` values in the same python run. Modules in the root folder have the empty string as `__package__`, and other modules have the package they reside in as `__package__`. Note that if a python program is run as a script (without -m), `__package__` becomes None, so relative imports will not work.

```python
""" app.py """
print(f'__package__ is {__package__}')

""" another_app.py """
import app

""" class_a.py """
from .. import app
```
```bash
# case 1
# if a module (file) is at root, or top-level, __package__ is the empty string
(/usr/test) $ python -m another_app
>>> __package__ is ''

# case 2
# when a top-level module is run as the entry point with -m, __package__ is the empty string
(/usr/test) $ python -m app
>>> __package__ is ''

# case 3
# the number of dots cannot exceed the number of packages in __package__;
# class_a has a 1-level package, moduleA, so .. does not work
(/usr/test) $ python -m moduleA.class_a
>>> ValueError: attempted relative import beyond top-level package
# to make it work, switch the root to /usr so that __package__ of class_a.py becomes test.moduleA
(/usr) $ python -m test.moduleA.class_a
>>> __package__ is test

# case 4
# if the program is run as a script, i.e., without -m, __package__ is None
(/usr/test) $ python app.py
>>> __package__ is None
```
- search from `sys.path`

```python
""" class_a.py """
import sys
print(f'sys.path[0] is {sys.path[0]}')
```

```bash
# the directory containing the script, i.e., /usr/test/moduleA
(/usr/test) $ python moduleA/class_a.py
>>> sys.path[0] is /usr/test/moduleA

# the current directory, i.e., /usr/test
(/usr/test) $ python -m moduleA.class_a
>>> sys.path[0] is /usr/test

# interactive shell
(/usr/test) $ python
~~~ import sys
~~~ sys.path[0]
>>> ''
```
- `__init__.py`

WARNING

Since python 3.3, an empty `__init__.py` is no longer needed to mark a folder as a python package.

```python
# moduleA's __init__.py runs in all of these cases; a package's __init__.py
# executes on the first import of the package or any of its submodules
from app.moduleA import method
import app.moduleA
import app.moduleA.sub_module  # works only if sub_module is a module, not a function
```
Reloading
After code changes, the running process still executes the outdated code, but the stack trace shows lines from the updated source file (so the two can mismatch).

```python
import importlib

# only a module can be reloaded
importlib.reload(com.package.module)
from com.package.module import method
```
Variable control
The `__all__` variable controls what can be imported by `from x import *`.
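A minimal sketch with a hypothetical module `mylib.py`:

```python
# mylib.py
__all__ = ['public_fn']

def public_fn():
    return 'visible'

def _private_fn():
    return 'hidden'

# elsewhere:
# `from mylib import *` binds only public_fn;
# _private_fn is still reachable via `import mylib; mylib._private_fn()`
```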
Miscellaneous
```python
from A.B.C import D
from B.C import D  # suppose A is added to PYTHONPATH
"""
Modifying variables inside the 1st D will not change those inside the 2nd D,
since the __package__ of the two Ds differ: they are two distinct module objects.
"""
```

```python
import A
A.B  # error: a submodule cannot be accessed if not imported
import A.B  # okay
```
# OOP
- `is` is used to check if two variables refer to the same object
- `self` has access to class variables, but read-only: assigning through `self` creates an instance attribute (see the sketch after this list)
  - it is recommended to access class methods through `self` for simplicity
  - it is recommended to access class variables through `type(self)` instead of `self.__class__`

Comparison of `type(self)` with `self.__class__`

- `type()` is the same as `__class__` in python3
- in python2, if a class has no inheritance (old-style class), only `__class__` works
child has access to parent's overridden method
```python
class Foo(Bar):
    def __init__(self, arg):
        super(Foo, self).__init__(arg)

    def baz(self, arg):
        # the method implementation is from the parent, but self refers to the child
        return super(Foo, self).baz(arg)
```
child has access to grandparent's overridden method
```python
class Bar(Cha):
    def __init__(self, arg):
        super(Bar, self).__init__(arg)

class Foo(Bar):
    def __init__(self, arg):
        super(Foo, self).__init__(arg)

    def chz(self, arg):
        # the method implementation is from the grandparent, but self refers to the child;
        # note the 1st argument of super is Bar instead of Foo
        return super(Bar, self).chz(arg)
```
type
The type of a class is `type` if no custom metaclass is given.

```python
class PythonBlog:
    pass

class PythonBlogSon(PythonBlog):
    pass

type(PythonBlog)     # type
type(PythonBlogSon)  # type

class MyMeta(type):
    pass

class PythonBlog(metaclass=MyMeta):
    pass

class PythonBlogSon(PythonBlog):
    pass

type(PythonBlog)     # MyMeta
type(PythonBlogSon)  # MyMeta
```
__new__
rarely used

```python
class Eel(object):
    MAX_EELS = 20
    n_eels = 0

    # memory allocation and static variable manipulation;
    # called to create an instance
    def __new__(cls, *args, **kwargs):
        if cls.n_eels == cls.MAX_EELS:
            raise HovercraftFull()
        obj = super(Eel, cls).__new__(cls)
        cls.n_eels += 1
        return obj
```
__call__
```python
class Foo:
    def __init__(self, a, b, c):
        pass

class Goo:
    def __call__(self, a, b, c):
        pass

Foo(1, 2, 3)  # __init__ is called
goo = Goo()
goo(1, 2, 3)  # __call__ is called

class Singleton(type):
    _instances = {}

    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
        return cls._instances[cls]

class MyClass(metaclass=Singleton):
    pass
```
__dict__
stores class attributes for class and instance attributes for instance
```python
class Test:
    def __init__(self):
        self.a = 10

    def execute(self):
        return 10

"""
mappingproxy({'__module__': '__main__',
              '__init__': <function __main__.Test.__init__(self)>,
              'execute': <function __main__.Test.execute(self)>,
              '__dict__': <attribute '__dict__' of 'Test' objects>,
              '__weakref__': <attribute '__weakref__' of 'Test' objects>,
              '__doc__': None})
"""
Test.__dict__
Test().__dict__  # {'a': 10}
```
__wrapped__
get access to wrapped underlying function
```python
import functools

class Memoize:
    def __init__(self, func):
        self.func = func
        self.memo = dict()
        # updates __module__, __name__, __qualname__, __annotations__ and __doc__;
        # no update on __dict__
        functools.update_wrapper(self, func, updated=[])

    def __call__(self, *args):
        if args not in self.memo:
            self.memo[args] = self.func(*args)
        else:
            print("cls decorator. You have printed this before")
        return self.memo[args]

class CallCounter:
    def __init__(self, func):
        self.func = func
        self.calls = 0
        # updates __module__, __name__, __qualname__, __annotations__ and __doc__;
        # updates __dict__, i.e., {**self.__dict__, **wrapped.__dict__}
        functools.update_wrapper(self, func)

    def __call__(self, *args, **kwargs):
        self.calls += 1
        return self.func(*args, **kwargs)

@Memoize
@CallCounter
def doubleprint(x):
    for elem in x:
        print(elem + " " + elem)

doubleprint                          # Memoize at 0x7fa8034edcc0
doubleprint.__wrapped__              # CallCounter at 0x7fa8019540b8
doubleprint.__wrapped__.__wrapped__  # the function itself
```
`__eq__` and `__hash__`

```python
class Number:
    def __init__(self, number):
        self.number = number

    def __eq__(self, other):
        """Overrides the default implementation"""
        if isinstance(other, Number):
            return self.number == other.number
        # delegate to other's __eq__
        return NotImplemented

    # must return an int
    def __hash__(self):
        """Overrides the default implementation"""
        return hash(tuple(sorted(self.__dict__.items())))

n1 = Number(2)
o1 = Other('other')
# tries n1's __eq__ first, then o1's, if Other is not a subclass of Number;
# tries o1's __eq__ first, then n1's, if Other is a subclass of Number
n1 == o1
```
staticmethod is inherited
```python
class A:
    @staticmethod
    def a():
        print('A')

class B(A):
    @staticmethod
    def b():
        print('b')

B.a()  # A
```
method resolution order
definition
```python
class Child(Parent):
    # determines the method resolution order
    def method(self, arg):
        """
        child methods (including __init__ and others) follow the method
        resolution order, i.e., mro. Note that mro traversal can be either
        disconnected or connected:
        1. Disconnected mro: if a class has no super().method(arg), resolution
           stops as soon as this class's method call is finished
        2. Connected mro: if every class excluding object calls
           super().method(arg), methods in every generation are called
        """
        # super().method(arg) equivalence
        mro = type(self).mro()
        for next_class in mro[mro.index(Child) + 1:]:  # slice to the end
            if hasattr(next_class, 'method'):
                next_class.method(self, arg)
                break
```
mro examples
```python
# mro example I
class Parent(object): pass
class ChildA(Parent): pass
class ChildB(Parent): pass
class Grandchild(ChildA, ChildB): pass

Grandchild.__mro__  # (Grandchild, ChildA, ChildB, Parent, object)

# mro example II
class Parent:
    def __init__(self, x):
        self.x = x
        print("initializing Parent")

class ChildA(Parent): pass

# if a class in the mro misses the method, that class is skipped
a = ChildA(10)  # skips ChildA, and Parent's __init__ is called
a.x  # 10

# mro example III
class Parent(object):
    def __init__(self, x):
        self.x = x
        print('initializing Parent')

    def gg(self):
        print('Parent!')

class ChildA(Parent):
    def __init__(self):
        print('initializing ChildA')

"""
the mro for both __init__ and gg is [ChildA, Parent]
execution order of __init__: ChildA starts, ChildA ends
execution order of gg: ChildA is skipped (no gg), Parent starts, Parent ends
"""
a = ChildA()
a.gg()  # works
a.x     # error

# mro example IV
class Parent(object):
    def __init__(self):
        print('initializing Parent')

class ChildA(Parent):
    def __init__(self):
        print('initializing ChildA')
        super().__init__()

class ChildB(Parent):
    def __init__(self):
        print('initializing ChildB')
        super().__init__()

class Grandchild(ChildA, ChildB): pass

"""
connected mro in the __init__ method
mro is G, A, B, Parent
execution order:
  G skips (no __init__ method)
  A starts
  B starts
  Parent starts
  Parent ends
  B ends
  A ends
execution result:
  initializing ChildA
  initializing ChildB
  initializing Parent
"""
Grandchild()

# mro example V
class Parent(object):
    def __init__(self):
        print('initializing Parent')

class ChildA(Parent):
    def __init__(self):
        print('initializing ChildA')
        # Parent.__init__(self) instead of super().__init__() disconnects the mro of __init__
        Parent.__init__(self)

class ChildB(Parent):
    def __init__(self):
        print('initializing ChildB')
        super().__init__()

class Grandchild(ChildA, ChildB): pass

"""
disconnected mro in the __init__ method
mro is G, A, B, Parent
execution order:
  G skips (no __init__ method)
  A starts
  Parent starts
  Parent ends
  A ends
execution result:
  initializing ChildA
  initializing Parent
"""
Grandchild()

# mro example VI
class Parent(object):
    def gg(self):
        print('Parent!')

class ChildA(Parent): pass

class ChildB(Parent):
    def gg(self):
        print('Child B!')

class Grandchild(ChildA, ChildB): pass

"""
disconnected mro in the gg method
mro is G, A, B, Parent
execution order:
  G skips (no own gg method)
  A skips (no own gg method)
  B starts
  B ends
execution result:
  Child B!
"""
g = Grandchild()
g.gg()
```
Enum
An enumeration is a set of symbolic names (members) bound to unique, constant values
```python
from enum import Enum

class TestEnum(Enum):
    a = 0
    b = 1

class Test:
    a = 0
    b = 1

# Enum is iterable
list(TestEnum)  # [<TestEnum.a: 0>, <TestEnum.b: 1>]
list(Test)      # error

# Enum members have value and name
TestEnum.a        # <TestEnum.a: 0>
Test.a            # 0
TestEnum.a.value  # 0
Test.a.value      # error
TestEnum.a.name   # 'a'
Test.a.name       # error

# Enum members cannot be reassigned
TestEnum.a = 10  # error
Test.a = 10      # fine
```
Enum values can be functions
```python
import functools
from enum import Enum

class Wrapper:
    def __init__(self, f):
        self.f = f
        # updates __module__, __name__, __qualname__, __annotations__ and __doc__;
        # updates __dict__
        functools.update_wrapper(self, f)

    def __call__(self, *args, **kwargs):
        return self.f(*args, **kwargs)

    def __repr__(self):
        return self.f.__repr__()

def fa():
    return 'A'

class TestEnum(Enum):
    # the wrapper makes `a` an attribute instead of a method definition
    a = Wrapper(fa)

    def __call__(self, *args, **kwargs):
        return self.value(*args, **kwargs)

TestEnum.a()  # 'A'
```
Enum values can be class
```python
from enum import Enum

class CA:
    def __init__(self, name):
        self.name = name

    def execute(self):
        return 'A'

class TestEnum(Enum):
    a = CA

    def __call__(self, *args, **kwargs):
        return self.value(*args, **kwargs)

TestEnum.a('some name').execute()  # 'A'
```
# Pickle
# general
Pickle of spacy.tokenizer.Tokenizer

After loading, `Tokenizer` will call the `token_match` method, hence `token_match` must not depend on variables that are restored later than `Tokenizer`.

- `dump`: might call `__getstate__` or `__reduce__`
- `load`: might call `__new__` or `__setstate__`

Pickleable objects are guaranteed to have the same value after un-pickling, but might produce a different `dump` value from the original dump: link
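A minimal sketch of the `dump`/`load` hooks, assuming an unpicklable attribute (an open file handle):

```python
import pickle

class Logger:
    def __init__(self, path):
        self.path = path
        self.fh = open(path, 'a')  # file handles cannot be pickled

    def __getstate__(self):  # consulted by dump
        state = self.__dict__.copy()
        del state['fh']      # drop the unpicklable part
        return state

    def __setstate__(self, state):  # consulted by load
        self.__dict__.update(state)
        self.fh = open(self.path, 'a')  # rebuild it

restored = pickle.loads(pickle.dumps(Logger('/tmp/x.log')))
```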
# Cannot pickle
```python
import functools
import pickle

def decorate(f):
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        print(pickle.dumps(f))
        return f(*args, **kwargs)
    return wrapper

@decorate
def main():
    print('A')

if __name__ == '__main__':
    """
    Pickle serializes only metadata about the function and restores it from that
    metadata. But at this point the decorator's wrapper has already replaced the
    main function, so the pickled function does not match the function that
    pickle tries to restore.

    Traceback (most recent call last):
    ...
    _pickle.PicklingError: Can't pickle <function main at 0x7ff1df907ee0>: it's not the same object as __main__.main
    """
    main()
```
# exec vs eval
exec("", scope)
: always return None. print
or import
takes side effect
eval
: accepts expression only. assignment or statement(like if) are rejected
# Closure
A closure is a function together with its enclosing environment.

Scope

`nonlocal` and `global` are keywords that control variable scope. Note that a variable created in the main body of Python code is a global variable and belongs to the global scope.
```python
def generate_counter2():
    x = 0
    def add_one():
        # by declaring x nonlocal, Python finds it in the parent scope
        # and closes over it
        nonlocal x
        x = x + 1
        return x
    return add_one

# closure = generate_counter2()
# closure.__closure__  => address 1
# closure()
# closure.__closure__  => address 2
```
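The `global` counterpart of the same counter, since the section mentions both keywords:

```python
count = 0  # created in the main body => global scope

def bump():
    global count  # without this, `count += 1` raises UnboundLocalError
    count += 1
    return count

bump()  # 1
bump()  # 2
```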
# Yield
```python
def gen():
    with some_fn() as manager:
        yield something

g = gen()
next(g)  # the manager has not exited yet
next(g)  # execution leaves the with block, and the manager exits
```
```python
def flatten(sequence):
    """flatten a multi-level list or similar
    >>> list(flatten([1, [2], 3]))
    [1, 2, 3]
    >>> list(flatten([1, [2], [3, [4]]]))
    [1, 2, 3, 4]
    """
    for element in sequence:
        if hasattr(element, '__iter__'):
            yield from flatten(element)
        else:
            yield element
```
```python
def writer():
    """A coroutine that writes data *sent* to it to fd, socket, etc."""
    while True:
        try:
            w = (yield)
            print('>> ', w)
        except StopIteration:
            print('Close generator!')

def writer_wrapper(sub_generator):
    # establishes a transparent bidirectional connection
    # between the caller and the sub-generator
    yield from sub_generator

w = writer()
next(w)
w.send('data1')
w.send('data2')
w.throw(StopIteration)

w = writer()
w_wrapper = writer_wrapper(w)
next(w_wrapper)
w_wrapper.send('data1')
w_wrapper.send('data2')
w_wrapper.throw(StopIteration)
```
# Coroutine and Future
Coroutine

There are two types of coroutines: native coroutines and legacy generator-based coroutines. Legacy coroutines were removed in python 3.11.

A native coroutine uses the `async`/`await` syntax. It is executed only when it is `await`ed.

A generator-based coroutine is the old way (python 3.5) to create a coroutine, with the `@asyncio.coroutine`/`yield from` syntax. `@asyncio.coroutine` enables the generator to use `yield from` to call native coroutines, and also enables the generator to be called by native coroutines, for instance using an `await` expression. The decorator `@types.coroutine` does the same thing, with minor differences.
```python
import asyncio

async def b_sleep():
    return 100

@asyncio.coroutine
def a_sleep():
    print("doing something in async")
    yield from b_sleep()  # yield from a native or generator-based coroutine

@asyncio.coroutine
def a_sleep():
    print("doing something in async")
    yield  # yield None

await a_sleep()
```
`asyncio` can create `Future`s, which are scheduled to run by the event loop regardless of `await`.
# Running order
If control is never yielded to the event loop, the running order is the same as the `await` order. If control is yielded to the event loop, scheduled `Future`s can be executed immediately.

To yield control to the event loop:
```python
import asyncio
import types

await asyncio.sleep(1)

# yield None
@types.coroutine
def __sleep0():
    yield

await __sleep0()

# yield a Future
await future
```
Details1 and [Details2](https://github.com/python/cpython/blob/febf54bcf3fdc45ad84b4073e24bbaaee0ac8b2a/Lib/asyncio/tasks.py#L255) on yielding control:
```python
import asyncio
import time

async def p(word):
    print(f'{time.time()} - {word}')

async def main1():
    loop = asyncio.get_event_loop()
    coro = p('await')
    task2 = loop.create_task(p('create_task2'))  # scheduled for the next iteration
    await coro   # coro runs only when awaited
    await task2  # wait for the final result

async def main2():
    loop = asyncio.get_event_loop()
    coro = p('await')
    task2 = loop.create_task(p('create_task2'))  # scheduled for the next iteration
    task3 = loop.create_task(p('create_task3'))  # scheduled for the next iteration
    await asyncio.sleep(1)  # the loop gets control and runs task2 and task3
    await coro   # coro runs only when awaited
    await task2  # wait for the final result
    await task3  # wait for the final result

# await > task2
await main1()
# task2 > task3 > await
await main2()
```
# Awaitable Objects
An awaitable object generally implements an `__await__()` method. Coroutine objects returned from `async def` functions are awaitable. The generator iterator objects returned from generators decorated with `types.coroutine()` or `asyncio.coroutine()` are also awaitable, but they do not implement `__await__()`.
```python
class MyObject:
    def __await__(self):
        # must return an iterator
        yield from a_future  # marks __await__ as an iterator
```
# Garbage collection
When an object is not referenced by any variable or other object, or becomes unreachable, it becomes garbage; Python makes the unused memory available for reuse, but when it gets returned to the OS depends on the allocator.
To check memory allocation:
```python
import gc
import sys

import numpy as np

np.array([1, 2, 3]).nbytes  # check memory allocation for numpy

# check memory allocation for any object
def get_obj_size(obj):
    marked = {id(obj)}
    obj_q = [obj]
    sz = 0
    while obj_q:
        sz += sum(map(sys.getsizeof, obj_q))
        # look up all the objects referred to by the objects in obj_q;
        # see: https://docs.python.org/3.7/library/gc.html#gc.get_referents
        all_refr = ((id(o), o) for o in gc.get_referents(*obj_q))
        # filter objects that are already marked;
        # using dict notation prevents repeated objects
        new_refr = {o_id: o for o_id, o in all_refr
                    if o_id not in marked and not isinstance(o, type)}
        # the new obj_q contains the objects that were not marked;
        # update marked with their ids so we do not traverse them again
        obj_q = new_refr.values()
        marked.update(new_refr.keys())
    return sz
```
To clear memory for a dict:
```python
a = {'a': [1, 2, 3], 'b': [4, 5, 6]}
del a['a']  # cuts the binding to the references; {'b': [4, 5, 6]}
a.pop('a')  # x = a['a']; del a['a']; return x
a.clear()   # empties the dict in place and clears the hash table; {}
```
To check number of references:
```python
import gc
import sys

a = {'a': [1, 2, 3], 'b': [4, 5, 6]}
gc.get_referrers(a)
sys.getrefcount(a)  # 2: the number of references plus 1, since `a` is passed to this function
```
# Subprocess
A child process becomes a zombie if it exits or fails and its parent never reaps it. Be careful to avoid creating zombie processes:
```python
import subprocess
from subprocess import Popen, PIPE, TimeoutExpired

# 1. use call
subprocess.call(['grep', 'jdoe', '/etc/passwd'])  # waits for the command to complete

# 2. use communicate
proc = Popen(['ls', '-l', '/tmp'], stdout=PIPE, stderr=PIPE)
stdout, stderr = proc.communicate()  # reads data from stdout and stderr until end-of-file

try:
    stdout, stderr = proc.communicate(timeout=15)  # if the child is still open after 15s, it is NOT killed
except TimeoutExpired:
    proc.kill()
    outs, errs = proc.communicate()

# 3. use wait
proc = subprocess.Popen(('ls', '-l', '/tmp'), stdout=subprocess.PIPE)
proc.wait()

# 4. if no wait, make sure you don't keep references to the Popen objects.
# If the python script exits, the subprocess gets pid 1 as its parent
Popen(["sleep", "30"])
```
# Concurrent & Parallel
`multiprocessing` is designed for parallelism.

| | Multi-args | Concurrence | Blocking | Ordered-results |
|---|---|---|---|---|
| apply | ✔ | X | ✔ | X |
| apply_async | ✔ | ✔ | X | X |
| map | X | ✔ | ✔ | ✔ |
| map_async | X | ✔ | X | ✔ |
| starmap | ✔ | ✔ | ✔ | ✔ |
| starmap_async | ✔ | ✔ | X | X |

- `apply(func, args, kwds)`: lets func run in a child process but blocks the main process; same as `apply_async().get()`
- `apply_async(func, args, kwds, callback, error_callback)`: schedules func and returns an async result object; the order of multiple tasks is indeterminate
```python
import logging
import multiprocessing as mp
from tqdm import tqdm

log = logging.getLogger(__name__)

# this function is called in the main process
def a_back(progress, result, saver):
    progress.update()
    log.info('sth')
    process_result(result)

# this function is called in the main process
def b_back(progress, exception, saver):
    progress.update()
    log.info('sth')
    process_exception(exception)

# exit: pool.terminate
with mp.Pool(processes=4) as pool:
    with tqdm(total=some_count) as progress:
        jobs = []
        saver = {}
        for i in a_iterator:
            job = pool.apply_async(func, args=(i,), kwds=a_dict,
                                   callback=lambda r: a_back(progress, r, saver),
                                   error_callback=lambda e: b_back(progress, e, saver))
            jobs.append(job)
        # handle exceptions thrown in child processes
        for j in jobs:
            try:
                j.get()  # blocks the main process, but other children keep running in the background
            except Exception as e:
                # the exception is still raised here even though it was already consumed once by error_callback
                handle()
        # alternatively, if not interested in exceptions:
        # pool.close()
        # pool.join()
        process_saver(saver)
```
- `map(func, [1, 2, 3, 4])`: chops the iterable into a number of chunks which it submits to the process pool as separate tasks; blocks the main process; order is guaranteed; `map(func, [(1, 2), (3, 4)])` does not pass multiple arguments
- `map_async(func, [1, 2], callback, error_callback)`: order is guaranteed
`fork` vs `spawn`

- `spawn` is `fork` + `execve`: Ref
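Selecting the start method explicitly; a minimal sketch (`fork` is unavailable on Windows):

```python
import multiprocessing as mp

def work(x):
    return x * x

if __name__ == '__main__':
    # 'spawn' starts a fresh interpreter and re-imports this module;
    # 'fork' copies the parent process's memory via fork(2)
    ctx = mp.get_context('spawn')
    with ctx.Pool(2) as pool:
        print(pool.map(work, [1, 2, 3]))  # [1, 4, 9]
```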
# Time
# Get
```python
from datetime import datetime
import pytz

# current time in the local time zone
datetime.now()
# current time in another time zone
datetime.now(pytz.timezone('America/Chicago'))
```
# Create
```python
# create a time with a time zone, e.g., 2015-06-11
my_datetime = datetime(2015, 6, 11, 13, 30)
my_tz = pytz.timezone('America/Chicago')
good_dt = my_tz.localize(my_datetime)  # replace(tzinfo=my_tz) gives a wrong result: https://stackoverflow.com/a/50613134/6845273
```
# Switch
```python
from dateutil import parser
from dateutil import tz

# switch utc to GMT+08:00
raw_time_str = '2021-11-03T09:53:10.683Z'
from_zone = tz.gettz('UTC')
to_zone = tz.gettz('Asia/Shanghai')  # the IANA name, also used by pytz
utc = parser.parse(raw_time_str)
utc = utc.replace(tzinfo=from_zone)
utc.astimezone(to_zone)
```
# Parser
```python
import dateparser

# parse with a custom format, custom locale, and base year
dateparser.parse(time_string,
                 date_formats=['%Y-%m-%d'],
                 settings={'DATE_ORDER': 'DMY',
                           'RELATIVE_BASE': datetime(2001, 1, 1),
                           'PARSERS': ['custom-formats']},
                 locales=['en'])

from dateutil import parser
parser.parse(time_string)
```
| query | dateutil | dateparser |
|---|---|---|
| 2016/09/01 | 1 | 1 |
| 2016-09-01 | 1 | 1 |
| 09/01/2016 | 1 | 1 |
| 09-01-2016 | 1 | 1 |
| 09012016 | 0 | 0 |
| 09/01/2016 15:20 | 1 | 1 |
| 09/01/2016 at 15h20 | 1 | 1 |
| 15 min ago | 0 | 1 |
| two hours ago | 0 | 1 |
| a day ago | 0 | 1 |
| tuesday | 0 | 1 |
| monday at noon | 0 | 1 |
| total (12) | 6 | 11 |
# Operation
```python
from datetime import datetime, timedelta
import pytz

# subtract
d = datetime.today() - timedelta(days=days_to_subtract)

# create from a timestamp
date = datetime.fromtimestamp(ten_digit_ts, pytz.timezone('Asia/Shanghai'))
```
# IDE
# Pycharm
dependency

- adding a local folder as a library root [interpreter settings] => added to the interpreter paths => added to `sys.path`
- python setup.py => adds a soft link in site-packages => `sys.path` contains site-packages
- adding a folder as a source root => added to PYTHONPATH => added to `sys.path`
- if `PATH` is edited in the run config, its value automatically becomes `new_value:$PATH`

profiler

- own time: time spent without children

locale

- `LC_CTYPE` is empty by default in the runner, and `C` in the pycharm terminal
# Conda
- channels can be modified at ~/.condarc
- some activation magic lives at $CONDA_PREFIX/etc/conda/activate.d

useful commands:

```bash
# refresh a package
conda install --force-reinstall package
# view package dependencies
# or at https://libraries.io/
conda create --dry-run --json -n dummy package
# search multiple channels for packages
conda install -c channel1 -c channel2 package
# view the current env path
echo $CONDA_PREFIX
# remove an env
conda env remove -n env_name
conda env remove -p env_path
```
# pip
useful commands
```bash
# reinstall a package
pip install --force-reinstall package==version
# generate requirements.txt
pip freeze > requirements.txt
# ignore the installed version
pip install -I
# search pypi first, then the extra url;
# pip 20.x chooses the latest possible version from the combined set;
# pip 21.x performs an additional compatibility check
pip install --extra-index-url https://123.com
# view installation logs
TMPDIR=./tmp pip install --no-clean
# fresh install without using any cache (e.g., built/downloaded wheels)
pip install --no-cache-dir
# fresh install without using any binary => build the binary locally
pip install --no-binary :all:
```

When making build requirements available, pip does so in an isolated environment. That is, pip does not install those requirements into the user’s site-packages, but rather installs them in a temporary directory which it adds to the user’s sys.path for the duration of the build. This ensures that build requirements are handled independently of the user’s runtime environment. For example, a project that needs a recent version of setuptools to build can still be installed, even if the user has an older version installed (and without silently replacing that version).

In certain cases, projects (or redistributors) may have workflows that explicitly manage the build environment. For such workflows, build isolation can be problematic. If this is the case, pip provides a --no-build-isolation flag to disable build isolation. Users supplying this flag are responsible for ensuring the build environment is managed appropriately (including ensuring that all required build dependencies are installed).

Build requirements are specified in pyproject.toml (PEP 518), e.g.,

```toml
[build-system]
requires = [
    "setuptools",
    "cython>=0.25,<3.0",
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
    "thinc>=8.0.12,<8.1.0",
    "blis>=0.4.0,<0.8.0",
    "pathy",
    "numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"
```

```bash
pip install --no-build-isolation
pipdeptree -fl             # examine the dependency tree
pip install --dry-run pkg  # resolve without installing
```
# yaml
`ruamel.yaml` is a derivative of `PyYAML`, which is not very actively developed. `PyYAML` supports the YAML 1.1 standard; `ruamel.yaml` supports YAML 1.2, as released in 2009.

More: PyYAML YAML 1.2 status: https://github.com/yaml/pyyaml/issues/116
# Disable alias
```python
import sys
import ruamel.yaml

class NonAliasingRTRepresenter(ruamel.yaml.representer.RoundTripRepresenter):
    def ignore_aliases(self, data):
        return True

yaml = ruamel.yaml.YAML()
yaml.Representer = NonAliasingRTRepresenter
yaml.dump(data, sys.stdout)
```
# Partial flow
YAML calls the indentation style “block” and the JSON style “flow”. Flow style can be used at any point within the block style.
```python
import sys
import ruamel.yaml

def L(*l):
    ret = ruamel.yaml.comments.CommentedSeq(l)
    ret.fa.set_flow_style()
    return ret

yaml = ruamel.yaml.YAML()
data = {}
data['users'] = L()
data['users'].append('user2 key1')
data['users'].append('user2 key2')
data['users2'] = L('abc', L('user2 group1', 'user2 group2'))
yaml.dump(data, sys.stdout)
"""
users: [user2 key1, user2 key2]
users2: [abc, [user2 group1, user2 group2]]
"""
```
# Multiline
http://yaml-multiline.info/
# Miscellaneous
strip
"[1,2,3,4]".strip("[1,2,]") # "3,4", any combination
find
"123".find('1') # 0
raw string
r"\x" # "\\x" r'\' # SyntaxError r"\\" # "\\\\" "\s" # "\\s" r"\s" # "\\s" "\f" # \x0c r"\f" # \\f
pathlib
```python
import json
import os
import pathlib
from pathlib import Path

pathlib.Path.cwd()                # current working directory
pathlib.Path(__file__).resolve()  # absolute path of the current file
p = Path("/home/user/Downloads/repo/test.txt")
p.stem  # 'test', name without extension
p.name  # 'test.txt', full name
json.loads(pathlib.Path('123.json').read_text())
os.getcwd()
os.path.dirname(os.path.realpath(__file__))  # directory
os.path.split(os.path.realpath(__file__))    # directory and filename
pathlib.Path('123/456/789/12.txt').parents[0] == pathlib.Path('123/456/789/12.txt').parent
pathlib.Path('123/456/789/12.txt').parents[1] == pathlib.Path('123/456/789/12.txt').parent.parent
```
The return value of `Path` is a special class; it can be used with `/`:

```python
pathlib.Path.cwd() / "dir"
```
shutil
```python
import os
import shutil
import sys
from pathlib import Path

shutil.copyfileobj(f, sys.stdout)  # similar to cat
shutil.move(f, target)   # target must not exist; target can be on a different disk
os.rename(f, target)     # target must not exist; target on the same disk
os.replace(f, target)    # target can exist; target on the same disk
Path(f).rename(target)   # Unix: target can exist; target can be on a different disk; similar to the mv command
Path(f).replace(target)  # target can exist on any os; target can be on a different disk
```
dict
{"a":10}.get(keyname, valueifkeynotexist) {"a": 10}.pop("b") # error {"a": 10}.pop("b", None) # ok
encoding

- utf-8 excel files contain a dummy prefix (BOM), so use `utf_8_sig`
- `open()`'s default encoding is decided by the environment's `locale`; it can be checked with `python -c 'import locale; print(locale.getpreferredencoding())'` and set with `export LC_ALL=en_US.UTF-8; export LANG=en_US.UTF-8`
- in python 3.6, `invalid_country.UTF-8` (a locale not present in `locale -a`) leads `preferredencoding` to be ascii; in python 3.7 this problem no longer exists
- `sys.stdout.encoding` is decided by the environment's `locale` and can be set with `PYTHONIOENCODING=UTF-8`
- `b'123'` -> `'123'`: `b'123'.decode('utf-8')`
string decomposition

Taking ⑴, ①, è and 프 into consideration:

- `unicodedata.normalize('NFD', text)`, canonical equivalence: è or 프
- `unicodedata.normalize('NFKD', text)`, compatibility equivalence: ⑴, ①, è and 프
- `unicodedata.decomposition(text)`, special format: ⑴, ①, è
- `unicodedata.category(char)` can be used to check if a character is an unwanted accent mark

doc

- `dir(object)`: check all attributes
- `object.attr.__doc__`: check how to use
- `?object.attr` in jupyter to check how to use
- `inspect.getsource(object.method)`: check the source code
yield
return a generator to save space and some code
```python
def countdown_gen(x):
    count = x
    while count > 0:
        yield count
        count -= 1

g = countdown_gen(5)
for item in g:
    print(item)  # 5, 4, 3, 2, 1
```
assign a value back to yield
```python
def getPrimes(number):
    while True:
        if isPrime(number):
            # the generator returns number, and assigns the sent value to number
            number = yield number
        number += 1

gen = getPrimes(10)
gen.send(None)  # the first send must be None; next(gen) <=> gen.send(None)
# 11
# execution is stuck at yield
gen.send(10)  # number becomes 10 and another round of iteration starts
# 11
# execution is stuck at yield
```
```python
def translator():
    # load all the words in the English language and their translations to 'other lang'
    my_words_dict = {'hello': 'hello in other language', 'dog': 'dog in other language'}
    while True:
        word = yield
        yield my_words_dict.get(word, 'Unknown word...')

my_words_translator = translator()
next(my_words_translator)               # stuck at the first yield
print(my_words_translator.send('dog'))  # assigns the value to word; stuck at the second yield
next(my_words_translator)               # stuck at the first yield again
print(my_words_translator.send('cat'))
```
throw exception
```python
def add_to_database(connection_string):
    db = mydatabaselibrary.connect(connection_string)
    cursor = db.cursor()
    try:
        while True:
            try:
                row = yield
                cursor.execute('INSERT INTO mytable VALUES(?, ?, ?)', row)
            except CommitException:
                cursor.execute('COMMIT')
            except AbortException:
                cursor.execute('ABORT')
    finally:
        cursor.execute('ABORT')
        db.close()

db = add_to_database('bla')
db.send(None)              # opens a database connection
db.send('bla')             # inserts a row
db.throw(CommitException)  # commits the transaction
```
yield from: transparent two way channel between the caller and the sub-generator, more info, but one simple usage is generating values from a iterator
```python
def flatten(sequence):
    """flatten a multi-level list or similar
    >>> list(flatten([1, [2], 3]))
    [1, 2, 3]
    >>> list(flatten([1, [2], [3, [4]]]))
    [1, 2, 3, 4]
    """
    for element in sequence:
        if hasattr(element, '__iter__'):
            yield from flatten(element)
        else:
            yield element
```
naming variables and packages: `identifier ::= (letter|"_") (letter | digit | "_")*`; breaking the rule is an interpreter error

Invalid Import

```python
from importlib import import_module
import_module('module-name')  # or:
__import__('module-name')
```
declaration hoisting
There is no hoisting in python
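What the absence of hoisting means in practice:

```python
print(y)  # NameError: a name does not exist before its binding line executes
y = 2

def f():
    print(x)  # UnboundLocalError: the assignment below makes x local to f,
    x = 1     # but there is no hoisted placeholder value as in JS
```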
lambda
```python
lambda *x: print(x)  # supports varargs
```
json
- `json.dump(ensure_ascii=True)`: saves 我 as \u6211, which can be read in any encoding as \u6211
- `json.loads()`: turns \u6211 back into 我

```python
import json

json.dump({"text": "我"}, open('123.json', 'w'), ensure_ascii=True)
with open('123.json', encoding='latin_1') as reader:  # any encoding works
    read_text = reader.read()
    json_text = json.loads(read_text)
    print(read_text)  # \u6211
    print(json_text)  # 我

json.dump({"text": "我"}, open('123.json', 'w'), ensure_ascii=False)
with open('123.json', encoding='latin_1') as reader:  # wrong encoding mangles the text
    read_text = reader.read()
    json_text = json.loads(read_text)
    print(read_text)  # {"text": "æ"}
    print(json_text)  # {'text': 'æ\x88\x91'}
```
Take care with control characters in `json.loads()`:

```python
import json

json.loads('{"apple": "good\nfruit"}', strict=False)  # a raw newline needs strict=False
json.loads(r'{"apple": "good\nfruit"}')               # an escaped \n is fine

# more examples
json.loads('["\\u00b9"]')
json.loads('["¹"]')
```
split
If sep is not specified or is None, a different splitting algorithm is applied: runs of consecutive whitespace are regarded as a single separator, and the result will contain no empty strings at the start or end if the string has leading or trailing whitespace. Consequently, splitting an empty string or a string consisting of just whitespace with a None separator returns [].
```python
' a b c '.split()     # ['a', 'b', 'c']
' a b c '.split(' ')  # ['', 'a', 'b', 'c', '']
```
variable scope
```python
def kk(value):
    def gg():
        print(value)
    gg()
    value = 10
    gg()

kk(3)
# 3
# 10
```