Index

Table of contents

python regex

search

basic usage
import re
# The quantifier goes OUTSIDE the class: '[0-9]+' means "one or more digits",
# whereas '[0-9+]' would match a single character that is a digit or a '+'.
regex = r'[0-9]+'
string = 'abcd 12345 efgh 67890'
# re.search returns the first match anywhere in the string, or None.
match = re.search(regex, string)
if match:
    print(match.group())    # prints 12345
match object fields
match = re.search(regex, string)
if match:
    # All three lines use the same space indentation; mixing a tab with
    # spaces inside one block raises TabError in Python 3.
    print(match.group())   # prints matching part of input
    print(match.span())    # prints tuple (start, end): index of first matched char & index one past the last
    print('input string:' + match.string)   # match.string is the string passed to search()
using flags
# Flags are the third argument of re.search; combine several with bitwise OR.
match = re.search(regex, string, re.IGNORECASE)
match = re.search(regex, string, re.DOTALL | re.MULTILINE)
extracting groups
string = 'someone@example.com'
# Raw string so that '\w' reaches the regex engine unchanged.
# (?:...) groups without capturing, so the only numbered group is (.+).
regex = r'(?:\w+)@(.+)'
result = re.search(regex, string)
print(result.group(0))    # prints: someone@example.com (group 0 is the whole match)
print(result.group(1))    # prints: example.com
try:
    result.group(2)       # there is no group 2
except IndexError as error:
    print(error)          # prints: no such group

findall

printing all matches of regex in string
# '[0-9]+' = one or more digits; the quantifier belongs outside the class
# ('[0-9+]' would return every digit as a separate one-character match).
regex = r'[0-9]+'
string = 'abcd 12345 efgh 67890'
# re.findall returns all non-overlapping matches as a list of strings.
result = re.findall(regex, string)
print(result)    # prints ['12345', '67890']

split

splitting a line on a regex
# Cut the date apart at every hyphen; the separators themselves are dropped.
string = '01-01-1970'
regex = '-'
result = re.split(pattern=regex, string=string)
print(result)    # prints ['01', '01', '1970']
split a limited number of times
regex = '-'
string = '01-01-1970'
# Pass maxsplit by keyword: giving it positionally is deprecated since Python 3.13.
# maxsplit=1 performs at most one split; the remainder stays in the last element.
result = re.split(regex, string, maxsplit=1)
print(result)    # prints ['01', '01-1970']

substitution

substitute 'x' for numbers
# Each digit is matched and replaced on its own, hence one 'x' per digit.
replace = 'x'
regex = '[0-9]'
string = 'ccv 123'
result = re.sub(pattern=regex, repl=replace, string=string)
print(result)    # prints ccv xxx
only replace first 2
# Pass count by keyword: giving it positionally is deprecated since Python 3.13.
# count=2 replaces only the first two matches, left to right.
result = re.sub(regex, replace, string, count=2)
print(result)    # prints ccv xx3
substitute with multiline flag
lines = 'first\n\n   \nsecond\n\nthird'
# flags MUST be passed by keyword: the fourth positional parameter of re.sub
# is count, so a positional re.MULTILINE (== 8) would silently mean
# "replace at most 8 times" and would not set any flag at all.
result = re.sub(r'\n\s*\n', '\n', lines, flags=re.MULTILINE)
print(repr(result))    # prints 'first\nsecond\nthird'

flags

A | ASCII      # ascii only matching (instead of unicode)
S | DOTALL     # dot '.' matches newline
I | IGNORECASE # case insensitive matching
M | MULTILINE  # multiline matching

syntax

a         # match the letter a
\.        # match a literal dot '.'
\\        # match backslash
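^         # match start of string (with MULTILINE flag: start of each line)
$         # match end of string or just before a trailing newline (with MULTILINE flag: end of each line)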

.         # match any character other than newline (with DOTALL / S flag: also matches newline)
\d        # match digit
\D        # match non-digit
\s	  # match whitespace
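\w        # match word character: [a-zA-Z0-9_] with ASCII flag; any Unicode letter, digit or underscore by default
\W        # match non-word character: [^a-zA-Z0-9_] with ASCII flag; the inverse of \w by default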
|         # alternative pattern (logical or)
character class
[0-9]	  # match digit
[^0-9]    # match non-digit
[0-9-]	  # match digit or hyphen '-'
[-0-9]    # match digit or hyphen '-'
[0-9.]    # match digit or dot
[aA]      # match 'a' or 'A'
multiplicity
?         # zero or one, greedy
??        # zero or one, lazy
*         # zero or more, greedy
*?        # zero or more, lazy
+         # one or more, greedy
+?        # one or more, lazy
{n}       # exactly n matches
{n,}      # n or more matches, greedy
{n,}?     # n or more matches, lazy
{n,m}     # n to m matches, greedy
{n,m}?    # n to m matches, lazy
capturing
(         # open capturing group
)         # close capturing group
(?:       # non-capturing group
\n        # backreference to captured group number n, e.g. \1 (n is a group number here, not a newline)
look around
(?=       # lookahead (must be followed by)
(?<=      # lookbehind (must be preceded by)
(?!       # negative lookahead (must not be followed by)
(?<!      # negative lookbehind (must not be preceded by)

documentation

https://docs.python.org/3/howto/regex.html
https://docs.python.org/3/library/re.html#module-re