Table of contents
python regex
search
basic usage
import re
regex = '[0-9]+'
string = 'abcd 12345 efgh 67890'
match = re.search(regex, string)
if match:
    print(match.group()) # prints 12345
match object fields
match = re.search(regex, string)
if match:
    print(match.group()) # prints matching part of input
    print(match.span()) # prints tuple: (start index of match, end index), where the end index is one past the last matched character
    print('input string:' + match.string)
using flags
match = re.search(regex, line, re.IGNORECASE)
match = re.search(regex, line, re.DOTALL | re.MULTILINE)
extracting groups
string = 'someone@example.com'
regex = r'(?:\w+)@(.+)'
result = re.search(regex, string)
print(result.group(0)); # prints: someone@example.com
print(result.group(1)); # prints: example.com
print(result.group(2)); # error: there is no group 2
findall
printing all matches of regex in string
regex = '[0-9]+'
string = 'abcd 12345 efgh 67890'
result = re.findall(regex, string)
print(result); # prints ['12345', '67890']
split
splitting a line on a regex
regex = '-'
string = '01-01-1970'
result = re.split(regex, string)
print(result); # prints ['01', '01', '1970']
split a limited number of times
regex = '-'
string = '01-01-1970'
result = re.split(regex, string, maxsplit=1)
print(result); # prints ['01', '01-1970']
substitution
substitute 'x' for numbers
string = 'ccv 123'
regex = '[0-9]'
replace = 'x'
result = re.sub(regex, replace, string)
print(result); # prints ccv xxx
only replace first 2
result = re.sub(regex, replace, string, count=2)
print(result); # prints ccv xx3
substitute with multiline flag
return re.sub(r'\n\s*\n', '\n', lines, flags=re.MULTILINE)
flags
A | ASCII # ascii only matching (instead of unicode)
S | DOTALL # dot '.' matches newline
I | IGNORECASE # case insensitive matching
M | MULTILINE # multiline matching: '^' and '$' also match at the start and end of each line
syntax
a # match the letter a
\. # match a literal dot '.'
\\ # match backslash
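^ # match start of string (with MULTILINE flag: also start of each line)
$ # match end of string or just before a trailing newline (with MULTILINE flag: also end of each line)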
. # match any character other than newline (with DOTALL / S flag set: also match newline)
\d # match digit (any Unicode decimal digit; only [0-9] with ASCII flag)
\D # match non-digit
\s # match whitespace
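\w # match word character ([a-zA-Z0-9_] with ASCII flag; otherwise also Unicode letters and digits)
\W # match non-word character ([^a-zA-Z0-9_] with ASCII flag; otherwise the complement of \w)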
| # alternative pattern (logical or)
character class
[0-9] # match digit
[^0-9] # match non-digit
[0-9-] # match digit or hyphen '-'
[-0-9] # match digit or hyphen '-'
[0-9.] # match digit or dot
[aA] # match 'a' or 'A'
multiplicity
? # zero or one, greedy
?? # zero or one, lazy
* # zero or more, greedy
*? # zero or more, lazy
+ # one or more, greedy
+? # one or more, lazy
{n} # exactly n matches
{n,} # n or more matches, greedy
{n,}? # n or more matches, lazy
{n,m} # n to m matches, greedy
{n,m}? # n to m matches, lazy
capturing
( # open capturing group
) # close capturing group
(?: # non-capturing group
\n # backreference to captured group number n, where n is a number (e.g. \1, \2), not a newline
look around
(?= # Lookahead
(?<= # Lookbehind (pattern must match a fixed-length string)
(?! # Negative Lookahead (must not be followed by)
(?<! # Negative Lookbehind (must not be preceded by; pattern must match a fixed-length string)
documentation
https://docs.python.org/3/howto/regex.html
https://docs.python.org/3/library/re.html#module-re