Metacharacters allow you to construct patterns that match specific
sequences
of characters within strings.
^: Matches the beginning of a string or the beginning of a line in
multiline mode.
$: Matches the end of a string or the end of a line in multiline mode.
\A: Matches the beginning of a string (similar to ^ but does not
depend on multiline mode).
\Z: Matches the end of a string (similar to $ but does not depend on
multiline mode).
.: Matches any single character except newline characters \n.
*: Matches zero or more occurrences of the preceding character or
group.
+: Matches one or more occurrences of the preceding character or
group.
?: Matches zero or one occurrence of the preceding character or group
(makes it optional).
[]: Defines a character class, matches any single character inside the
brackets.
|: Acts as a logical OR, matches either the pattern on its left or the
pattern on its right.
(): Groups patterns together, allows applying quantifiers to multiple
characters or patterns.
\: Escapes metacharacters, allows using them as literal characters
(e.g., \., \\).
Unit 2, Chapter 2
Regular Expressions match and search methods
# to check whether a particular string is present in input string or
not
# we can use in operator
txt="Python is easy programming language"
print("Python" in txt)
print("python" in txt)
print(txt.find("hon"))
txt.find("xyz")
import re
txt="Python is easy programming language"
print("Match method to check python present are not")
print(re.match("Python",txt))
print()
print("Search method")
print(re.search("Python",txt))
print("Match method")
print(re.match("Perl",txt))
print()
print("Search method")
print(re.search("Perl",txt))
print("Match method to check python present are not")
print(re.match("programming",txt))
print()
print("Search method")
print(re.search("programming",txt))
# match method - checks only in the beginning,
#if it is not present in the beginning it will return you None
# search method - check in any part of the string
#if it is not present, it returns None
print("Match method")
print(re.match("Perl",txt))
print()
print("Search method")
print(re.search("Perl",txt))
print(re.match('P\w\w\w\w\w',txt))
#\w-alphanumeric- alphabet, number, underscore
print(re.match('p\w\w\w\w\w',txt))
print(re.search('P\w\w\w\w\w',txt))
print(re.search('p\w\w\w\w\w',txt))
txt="RE_VA university"
print(re.match('R\w\w',txt))
txt="RE-VA University"
print(re.match('R\w\w',txt))
txt="Python is easy programming language"
print(re.search('p\w+',txt))
txt="Python is easy p+rogramming language"
print(re.search('p\w+',txt))
txt="Python and Perl are programming languages"
print(re.search('P\w+',txt))
sword=re.search('P\w+',txt)
print(sword)
print()
print(sword.group())
mword=re.match('P\w+',txt)
print(mword)
print()
print(mword.group())
#replacing a string
#to replace the string by replace method, no package needed
txtre="Red Lorry Yellow Lorry"
xre=txtre.replace("Lorry","Bus")
print(xre)
print(txtre)
# replacing the string with Sub (substitution) method
# to use sub method import re
#make a note of syntax difference between sub and replace methods
import re
txt="Red Lorry Yellow Lorry"
x=re.sub("Lorry","Bus",txt)
print(txt)
print(x)
#split method
import re
txt="Red Lorry Yellow Lorry"
x=re.split("\s",txt)
print(x)
txt="Red Lorry,Yellow Lorry"
x=re.split(",",txt)
print(x)
txt="Red+Lorry-Yellow#Lorry"
x=re.split("[-+#]",txt)
print(x)
txt="RedBigLorry-YellowSmallLorry"
x=re.split("Big|Small|-",txt)
print(x)
txt="RedBigLorry-YellowSmallLorry"
x=re.split("[BigSmall-]",txt)
print(x)
txt="Red Lorry Yellow Lorry"
print(re.findall("Lorry",txt))
x=re.findall("Lorry",txt)
print(x)
print(re.search("Lorry",txt))
import re
txt="Python and Perl are programming languages"
print(re.findall('P\w+',txt))
txt="Python, PHP_ and Perl are programming languages"
print(re.findall('P\w\w\w',txt))
import re
txt="22-March_ 20"
# find all lower case alphabet characters between "a" and "z"
x=re.findall("[a-z]",txt)
print(x)
# find all lower case characters between"a" and "d"
x=re.findall("[a-d]",txt)
print(x)
print(re.findall("hello world", "hello"))
print(re.findall("hello","hello world"))
txt="4th semester"
x=re.findall("[sem]",txt)
print(x)
print(re.findall("sem",txt))
print(re.match("sem",txt))
print(re.search("sem",txt))
re.findall("\d",txt)
re.findall("^4",txt)
# ^ searches for the occurrence of the expression at the beginning of the
# string
re.findall("4",txt)
re.findall("\A4",txt)
# same as ^, but \A matches only at the beginning of the
# entire string
^ matches the beginning of the string, and also matches the beginning
of
each line in a multiline string if the re.MULTILINE flag is set.
\A strictly matches the beginning of the string and never considers
the beginning of each line in a multiline string, even with re.MULTILINE.
txt="Python, PHP and Perl are Programming Languages"
print(re.findall('P\w\w\w',txt))
txt="Python, PHP and Perl are Programming Languages"
print(re.match('p\w\w\w',txt))
txt="Python, PHP and Perl are Programming Languages"
print(re.search('P\w\w\w',txt))
meta characters
. - any one character
+ - one time or more times
? - zero or one
* - zero or more times
import re
result=re.findall('.','Python, PHP and Perl are Programming
Language.')
print (result)
import re
result=re.findall('\w','Python, PHP and Perl are Programming
Language.')
print (result)
#extract each word (* or +)
import re
result=re.findall('\w*','Python, PHP and Perl are Programming
Languages')
print (result)
import re
result=re.findall('\w+','Python, PHP and Perl are Programming
Languages')
print (result)
#fetch first word
result=re.findall('^\w+','Python, PHP and Perl are Programming
Languages')
print (result)
#fetch last word
#If we use “$” instead of “^”, the pattern returns the word at the
#end of the string
import re
result=re.findall('\w+$','Python, PHP and Perl are Programming
Languages')
print (result)
result=re.findall('\w+$','Python, PHP and Perl are Programming
Languages.')
print (result)
#Return the characters of each word two at a time
#Extract consecutive pairs of characters from each word, excluding spaces
#(using “\w”)
result=re.findall('\w\w','Python, PHP and Perl are Programming
Languages')
print (result)
result1=re.findall('\w.','Python, PHP and Perl are Programming
Languages')
print (result1)
#Extract consecutive two characters those available at start of word
boundary (using “\b“)
result1=re.findall(r'\b\w{2}','Python, PHP and Perl are Programming
Languages')
#or
#result1=re.findall(r'\b\w\w','Python, PHP and Perl are Programming
Languages')
print (result1)
#Python raw string is created by prefixing a string literal with ‘r’
or ‘R’.
#Python raw string treats backslash (\) as a literal character.
#This is useful when we want to have a string that contains backslash
#and don’t want it to be treated as an escape character.
s = 'Hi\nHello'
print(s)
raw_s = r'Hi\nHello'
print(raw_s)
s ='Hi\xHello'
print(s)
s = r'Hi\xHello'
print(s)
txt="trying to print \n character"
print(txt)
txt="trying\\ to print \\n char\\acter"
print(txt)
txt="trying to print \\n character \t without \\ "
print(txt)
import re
txt="Python\tprogramming"
print(txt)
print(re.search("on\s",txt))
txt=r"Python\tprogramming"
print(txt)
print(re.search("on\s",txt))
# \b
# Returns a match where the specified characters are at the beginning
or at the end of a word
# (the "r" in the beginning is making sure that the string is being
treated as a "raw string")
import re
txt="The cat sat on The mat!"
#check if the string starts with "The"
#check if the string starts with "the"
#check if the string ends with "mat"
#check if the string ends with"mat!"
# ^ beginning, $ end
import re
tnt="the clever fox\nthe not so clever fox\nthe blue box"
print(re.findall("^the",tnt,re.MULTILINE))
tnt="the clever fox \n the not so clever fox \n the blue box"
print(re.findall("^the",tnt,re.MULTILINE))
tnt="the clever fox \n the not so clever fox \n the blue box"
print(re.findall("^the",tnt,re.MULTILINE))
tnt="clever fox \n the not so clever fox \n the blue box"
print(re.findall("^the",tnt,re.MULTILINE))
tnt="clever fox \n the not so clever fox \n the blue box"
print(re.findall("^the",tnt,re.MULTILINE))
#write a program to search for a programming language from a string
#"Python is a easy programming language"
import re
txt="Python is a easy programming language"
sword=re.search("prog\w+\s\w+",txt)
print(sword)
print(sword.group())
import re

# Search the sentence for the word starting with "prog" together with the
# word that follows it, and report whether it was found.
txt = "Python is a easy programming language"
# Raw string: \w and \s reach the regex engine as written instead of being
# treated as (invalid) Python string escapes.
sword = re.search(r"prog\w+\s\w+", txt)
# Alternative: r"prog\w+" would match only the single word "programming".
print(sword)
# re.search returns None when nothing matches, so .group() is only called
# inside the guarded branch.
if sword:
    print("Substring found")
    print(sword.group())
else:
    print("Not found")
• \W - non alphabet, non digit, not an underscore
• \w - alphabet, digit, an underscore
• \s - whitespace character (space, tab, newline)
• \S - not a whitespace character
• \d - a digit
• \D - not a digit
Repeaters : * , + and { } : These symbols act as repeaters and tell the regex engine that the
preceding character or group may be repeated.
• + – one or more times
• * – zero or more times
• ? – optional (zero or one time)
• ^ – the match must start at the beginning of the string
• $ – the match must end at the end of the string
• {} – repeat as many times as the value inside the braces: {2} means that the
preceding character is repeated exactly 2 times, {min,} means that the preceding
character is matched min or more times, and {min,max} means that the preceding
character is repeated at least min and at most max times.
re.split('\W','Hello,hello-hello_hello')
re.split('\W','Hello,hello-hello_hello',maxsplit=1)
re.split('\W','Hello,hello-hello_hello',maxsplit=2)
re.split('\W','Hello,hello-hello_hello',maxsplit=3)
re.split('\W','Hello,hello-hello_hello',maxsplit=4)
Python Raw String
https://www.youtube.com/watch?v=kkTZ0EZws9Y&t=89s
Python Regular Expression - 1 - match, search
https://www.youtube.com/watch?v=kDEOZvjazLs&t=21s
Python Regular Expression - 2 - Sub, Split
https://www.youtube.com/watch?v=EYg1TBpWnYA&t=14s
Python Regular Expression - 3 - findall
https://www.youtube.com/watch?v=wWuX1BTaYoY&t=10s
Python Regular Expression - 4 - metacharacters ( ^, $ )
https://www.youtube.com/watch?v=Yukl9rp6xA0&t=4s
Python Regular Expression - 5 - metacharacters ( ^, $, \A, \Z )
https://www.youtube.com/watch?v=u68ZQ-pDz1Y&t=15s
Python Regular Expression - 6 - metacharacters -repetitors ( . + ? *)
https://www.youtube.com/watch?v=nP_BAaCsjS4&t=4s
Python Regular Expression - 7 split function with maxsplit parameter
https://www.youtube.com/watch?v=AI29X9Ok850
Python - Extracting Phone Number From A String
https://www.youtube.com/watch?v=7rmlWK8DD5Y
Python- Roman Number Hundreds & Fifties
https://www.youtube.com/watch?v=UT93oddqoMg
# \b - word boundary
import re

txt = "The rain in spain"
# Other patterns to try: r"\bain", r"\bin", r"\brain"
# r"ain\b" matches "ain" only where it is immediately followed by a word
# boundary, i.e. at the end of a word.
x = re.findall(r"ain\b", txt)
print(x)
# findall returns a list; an empty list is falsy.
if x:
    print("Yes, match found")
else:
    print("Not found")
['ain', 'ain']
Yes, match found
#Return the domain type of given email-ids
# The sample text is bound to `emails` rather than `str`, so the built-in
# str type is not shadowed by this example.
emails = ('abc.test@gmail.com, xyz@test.in, test.first@analyticsvidhya.com, '
          'first.test@rest.biz')
# Everything after "@".  The dot is escaped (\.) so that it matches only a
# literal "." and not any character.
x = re.findall(r'@(\w+\.\w+)', emails)
print(x)
# Only the domain type: the capturing group selects the part after the dot.
x = re.findall(r'@\w+\.(\w+)', emails)
print(x)
# Extract all characters after “@”
#1. output: ['@gmail.com', '@test.in', '@analyticsvidhya.com',
'@rest.biz']
['gmail.com', 'test.in', 'analyticsvidhya.com', 'rest.biz']
['com', 'in', 'com', 'biz']
#Return the domain type of given email-ids
str= 'abc.test@gmail.com, xyz@test.in, test.first@analyticsvidhya.com,
first.test@rest.biz'
# Extract all characters after “@”
1. output: ['@gmail.com', '@test.in', '@analyticsvidhya.com',
'@rest.biz']
# Extract only domain name
2. output: ['com', 'in', 'com', 'biz']
1. r'@\w+.\w+
2. r'@\w+.(\w+)
str1= 'abc.test@gmail.com, xyz@test.in,
test.first@analyticsvidhya.com, first.test@rest.biz'
res=re.findall(r'@\w+.\w+',str1)
print(res)
['@gmail.com', '@test.in', '@analyticsvidhya.com', '@rest.biz']
str1= 'abc.test@gmail.com, xyz@test.in,
test.first@analyticsvidhya.com, first.test@rest.biz'
res=re.findall(r'@\w+.(\w+)',str1)
print(res)
['com', 'in', 'com', 'biz']
r'@\w+.\w+': This regular expression matches an @ symbol followed by
one or more word characters (\w+),
then any single character (.), and finally, one or more word characters
(\w+).
The . in the middle is unescaped, so it matches any character except a newline;
write \. to match only a literal dot.
r'@\w+.(\w+)': This regular expression matches an @ symbol followed by
one or more word characters (\w+),
then any single character (the same unescaped .), and finally, one or more word characters
(\w+).
However, the difference here is that the parentheses around the
second \w+ create a capturing group.
This means that re.findall returns only the characters matched by the second \w+
(i.e., the characters immediately following the dot)
instead of the whole match.
# Return date from given string
str= 'Amit 34-3456 12-05-2007, XYZ 56-4532 11-11-2011, ABC 67-8945 12-
01-2009'
# Extract complete date
1. Output: ['12-05-2007', '11-11-2011', '12-01-2009']
# Extract only year
2. Output: ['2007', '2011', '2009']
# Return all words of a string those starts with vowel
str= 'AV is largest Analytics community of India'
# Return words starts with vowels
Output: ['AV', 'is', 'Analytics', 'of', 'India']
#Return words that start with consonants
Output: ['largest', 'community']
1. r'\b[aeiouAEIOU]\w+'
2. r'\b[^aeiouAEIOU ]\w+'
# Street Address
s = '100 NORTH MAIN ROAD'
s.replace('ROAD', 'RD.')
'100 NORTH MAIN RD.'
s = '100 NORTH BROAD ROAD'
# -4-3-2-1
s.replace('ROAD', 'RD.')
'100 NORTH BRD. RD.'
print(s[-4])
R
# but we want to replace only the last occurrence
s = '100 NORTH BROAD ROAD'
s[:-4] + s[-4:].replace('ROAD', 'RD.')
'100 NORTH BROAD RD.'
s[-4:]+s[-5:].replace('ROAD','RD.')
print(s)
100 NORTH BROAD ROAD
re.sub('ROAD$', 'RD.', s)
'100 NORTH BROAD RD.'
#validating phone number
import re
str1="This is my number +919123456789 and +919999999999"
res=re.search('\+91\d{10}',str1)
print(res.group())
+919123456789
import re

# Collect every Indian mobile number written as +91 followed by 10 digits.
str1 = "This is my number +919123456789 and +919999999999"
# Raw string: "\+" (a literal plus sign) and "\d" (a digit) are regex escapes,
# not Python string escapes.
res = re.findall(r'\+91\d{10}', str1)
# findall returns a list, which is empty (falsy) when nothing matched.
if res:
    print("pattern matched")
    print(res)
else:
    print("Not matched")
pattern matched
['+919123456789', '+919999999999']
import re

# Same search as before, but the country code may be separated from the
# number by an optional hyphen.
str1 = "This is my number +919123456789 and +91-9999999999"
# Raw string keeps the regex escapes intact; "-?" makes the hyphen optional.
res = re.findall(r'\+91-?\d{10}', str1)
# findall returns a list, which is empty (falsy) when nothing matched.
if res:
    print("pattern matched")
    print(res)
else:
    print("Not matched")
pattern matched
['+919123456789', '+91-9999999999']
str2="This is my number is 080-12345678 and account number is KAT080-
22222222 and another number is 080-99999999"
print(re.findall('\d{3}-\d{8}',str2))
print(re.findall(r'\b\d{3}-\d{8}\b',str2))
['080-12345678', '080-22222222', '080-99999999']
['080-12345678', '080-99999999']
100 - C 150 - CL
200 - CC 250 - CCL
300 - CCC 350 - CCCL
400 - CD 450 - CDL
500 - D 550 - DL
600 - DC 650 - DCL
700 - DCC 750 - DCCL
800 - DCCC 850 - DCCCL
900 - CM 950 - CML
import re

# Check whether the entered text is the Roman numeral for a multiple of one
# hundred (100 to 900).
ronum = input("enter roman number to match with hundred")
# ? means 0 or 1 occurrence: 900 (CM) | 400 (CD) | 100, 200, 300, 500, 600, 700, 800
# Every part of the pattern is optional, so an empty input would also match;
# the truthiness check on ronum rejects it explicitly.
if ronum and re.search(r'^(CM|CD|D?C?C?C?)$', ronum):
    print(ronum," is a valid roman number")
    print("and match with hundred and multiple of hundreds")
else:
    print (ronum, "is not mathcing with hundred in roman number")
enter roman number to match with hundredCD
CD is a valid roman number
and match with hundred and multiple of hundreds
import re

# Check whether the entered text is the Roman numeral for a multiple of one
# hundred, optionally followed by L (fifty), e.g. CL = 150 or CML = 950.
ronum = input("enter roman number to match with hundred")
# ? means 0 or 1 occurrence: 900 (CM) | 400 (CD) | 100, 200, 300, 500, 600, 700, 800
# C{0,3} is the compact form of C?C?C?.  Every part of the pattern is
# optional, so an empty input would also match; the truthiness check on
# ronum rejects it explicitly.
if ronum and re.search(r'^(CM|CD|D?C{0,3})L?$', ronum):
    print(ronum," is a valid roman number")
    print("and match with hundred and multiple of hundreds")
else:
    print (ronum, "is not mathcing with hundred in roman number")
enter roman number to match with hundredCM
CM is a valid roman number
and match with hundred and multiple of hundreds
# Verbose Regular Expressions
# pattern = '^M?M?M?M?(CM|CD|D?C?C?C?)(XC|XL|L?X?X?X?)(IX|IV|V?I?I?I?)$'
# pattern = '^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$'
# The compact forms above are very complicated and easy to forget.
# With re.VERBOSE, whitespace and "#" comments inside the pattern are ignored,
# so every part can be documented in place.  Each comment must sit after a
# "#" on its own line: text on a line without "#" is read as part of the
# pattern.
pattern = r"""
    ^                   # beginning of string
    M{0,4}              # thousands - 0 to 4 M's
    (CM|CD|D?C{0,3})    # hundreds - 900 (CM), 400 (CD), 0-300 (0 to 3 C's),
                        #            or 500-800 (D, followed by 0 to 3 C's)
    (XC|XL|L?X{0,3})    # tens - 90 (XC), 40 (XL), 0-30 (0 to 3 X's),
                        #        or 50-80 (L, followed by 0 to 3 X's)
    (IX|IV|V?I{0,3})    # ones - 9 (IX), 4 (IV), 0-3 (0 to 3 I's),
                        #        or 5-8 (V, followed by 0 to 3 I's)
    $                   # end of string
    """
re.search(pattern, 'M', re.VERBOSE)
re.search(pattern, 'MCMLXXXIX', re.VERBOSE)
<re.Match object; span=(0, 9), match='MCMLXXXIX'>
#validating phone number
import re
str1="This is my number +919123456789 and +919999999999"
res=re.search('\+91\d{10}',str1)
print(res)
<re.Match object; span=(18, 31), match='+919123456789'>
Apply the phone pattern search method to search the following phone
patterns.
i) 800-555-1212 ext. 1234
ii.) work 1-(800) 555.1212 #1234
iii) 800-555-1212
iv) 800.555.1212 Each one
2 marks
import re
def search_phone_patterns(text):
    """Find, print and return the phone numbers contained in ``text``.

    A phone number is an optional leading "1" country code, a three-digit
    area code (plain or in parentheses), a three-digit prefix and a
    four-digit line number separated by "-" or ".", optionally followed by
    an extension written as "ext. 1234" or "#1234".

    Args:
        text: The string to search.

    Returns:
        A list with the full text of every phone number found, in order of
        appearance; empty when there is none.
    """
    # Define the regular expression pattern for phone numbers.  Only
    # non-capturing groups are used so that re.findall returns whole matches.
    pattern = r"""
        (?<!\w)                         # not in the middle of a word or number
        (?:1[-.\s])?                    # optional country code, e.g. "1-"
        (?:\d{3}[-.]|\(\d{3}\)\s*)      # area code: "800-", "800." or "(800) "
        \d{3}[-.]\d{4}\b                # local number: "555-1212" or "555.1212"
        (?:\s*(?:ext\.\s*|\#)\d+)?      # optional extension: "ext. 1234" or "#1234"
    """
    # Search for phone number patterns in the text
    matches = re.findall(pattern, text, re.VERBOSE)
    # Print the matches
    print("Phone number patterns found:")
    for match in matches:
        print(match)
    return matches


# Given phone patterns
patterns = [
    "800-555-1212 ext. 1234",
    "work 1-(800) 555.1212 #1234",
    "800-555-1212",
    "800.555.1212"
]
# Apply phone pattern search method to each pattern
for pattern in patterns:
    search_phone_patterns(pattern)