Professional Documents
Culture Documents
def isLGNumber(t):
    """Return True if ``t`` has the LG customer-care shape ``dddd-ddd-dddd``.

    The string must be exactly 13 characters long: four decimal digits, a
    hyphen, three decimal digits, a hyphen, and four decimal digits.

    Args:
        t: Candidate string to validate.

    Returns:
        True when every position fits the pattern, otherwise False.
    """
    if len(t) != 13:
        return False
    # Hyphens sit at indexes 4 and 8; every other position must be a digit.
    for i, ch in enumerate(t):
        if i in (4, 8):
            if ch != "-":
                return False
        elif not ch.isdecimal():
            return False
    return True
# Show the expected shape, then run the validator on three sample inputs.
print("LG - customer care numbers are of the form dddd-ddd-dddd where d is any number\n\n")
checks = (
    ("Is 1800-315-9999 an LG customer care number? ", '1800-315-9999'),
    ("\nIs 1800-315-9998 an LG customer care number? ", '1800-315-9998'),
    ("\nIs w800-315-9998 an LG customer care number? ", 'w800-315-9998'),
)
for question, candidate in checks:
    print(question)
    print(isLGNumber(candidate))
LG - customer care numbers are of the form dddd-ddd-dddd where d is any number
True
True
False
runningText = "LG numbers are 1800-315-9999 and 1800-180-9999 and 800-180-9999"
# Slide a 13-character window (the length of dddd-ddd-dddd) across the text.
# Start positions closer than 13 characters to the end would only produce
# shorter slices, which isLGNumber always rejects, so they are not visited.
windowLength = 13
for i in range(len(runningText) - windowLength + 1):
    extractPhone = runningText[i:i + windowLength]
    if isLGNumber(extractPhone):
        print("Phone Number Found - " + extractPhone)
print("All possible matches found")
1. Compile : Create a Regex object - the expression that needs to be matched is written within this.
2. Search : Pass the string you want to search - the string from where the expression is to be matched.
3. Group : Call the Match object’s group() method to return a string of the actual matched text.
# Finding patterns of Text with Regular Expressions:
''' steps - compile, search, group '''
# Lg number for reference - 1800-315-9999
import re
lgnumber = re.compile(r'(\d\d\d\d)-(\d\d\d)-(\d\d\d\d)')
s = lgnumber.search("LG customer care for india number is 4800-180-9999")
# group(0) is the whole match; a bare group() call returns the same thing.
print("Entire group number: ", s.group(0))
# Captured groups are numbered from 1, left to right.
for number, label in enumerate(
    ("First member of group: ", "Second member of group: ", "Third member of group: "),
    start=1,
):
    print(label, s.group(number))
print("Using groups instead of group")
# groups() hands back all captured groups at once as a tuple.
group1, group2, group3 = s.groups()
print("\n".join((group1, group2, group3)))
4800
180
9999
The \( and \) escape characters in the raw string passed to re.compile() will match actual parenthesis characters. In regular expressions, the
following characters have special meanings:
.^$*+?{}[]\|()
If you want to detect these characters as part of your text pattern, you need to escape them with a backslash:
\. \^ \$ \* \+ \? \{ \} \[ \] \\ \| \( \)
# More on Pattern Matching with Regular Expressions:
# PIPE - "OR" operation
import re
collegeRegex = re.compile(r'bmsit|bmsce')
# With "|" either alternative may match; search() reports whichever one
# occurs first in the text.
for sentence in ("bmsit and bmsce", "bmsis and bmsce"):
    s = collegeRegex.search(sentence)
    print(s.group())
bmsit
bmsce
# Neither "bmsit" nor "bmsce" occurs in this text.
s = collegeRegex.search("bmsis and bmscs")
print(s.group())
# No matches: search() returned None, so calling .group() on it gives an error
---------------------------------------------------------------------------
<ipython-input-70-ffcc446bd934> in <module>()
# More on Pattern Matching with Regular Expressions:
# PIPE - "OR" operation
import re
collegeRegex = re.compile(r'bms(it|ce|wpu|sa|cl)')
sentence = "bmssa has 800 students and bmswpu has 6000 students"
s = collegeRegex.search(sentence)
# Index 0 is the full match; index 1 is the suffix picked inside the group.
for index in (0, 1):
    print(s.group(index))
bmssa
sa
# More on Pattern Matching with Regular Expressions:
# ? = optional matching with ? (matching 0 or 1 instances)
collegeRegex = re.compile(r'Bms( college of)? law')
# The optional group takes part in the first match and is skipped in the
# second, where group(1) is None.
for sentence in ("Bms college of law", "Bms law"):
    s = collegeRegex.search(sentence)
    print(s.group())
    print(s.group(1))
college of
Bms law
None
# More on Pattern Matching with Regular Expressions:
# * = matching 0 or more with "*"
collegeRegex = re.compile(r'Bms( college of)* law')
# "*" accepts the group zero, one or many times, so all three texts match.
for sentence in (
    "Bms law",
    "Bms college of law",
    "Bms college of college of college of law",
):
    s = collegeRegex.search(sentence)
    print(s.group())
# s is still the match for the last text; a repeated group keeps only the
# text of its final repetition.
print(s.group(1))
Bms law
college of
# SAME CODE AS ABOVE, BUT + INSTEAD, thus "bms law" gives error
# More on Pattern Matching with Regular Expressions:
# * = matching 0 or more with "*"
# + = matching 1 or more with "+"
# "+" needs the group at least once, so the first two searches succeed.
collegeRegex = re.compile(r'Bms( college of)+ law')
s = collegeRegex.search("Bms college of law")
print(s.group())
s = collegeRegex.search("Bms college of college of college of law")
print(s.group())
# "Bms law" contains no " college of" at all, so search() returns None ...
s = collegeRegex.search("Bms law")
# ... and calling .group() on None raises an error (this is the intended demo).
print(s.group())
---------------------------------------------------------------------------
<ipython-input-72-ab867c616037> in <module>()
1 s = collegeRegex.search("Bms law")
----> 2 print(s.group())
# More on Pattern Matching with Regular Expressions:
# Matching a specific number of repetitions using {}
bmsRegex = re.compile(r'(Bms){3}')
sample = "BmsBmsBms"
s = bmsRegex.search(sample)
# {3} matches exactly three consecutive repetitions of the group "Bms".
print(s.group(0))
BmsBmsBms
# GREEDY
import re
bmsRegex = re.compile(r'(Bms){3,5}')
# Greedy matching: anywhere from 3 to 5 repetitions is acceptable and the
# longest possible run is taken, so a text holding more than five "Bms"
# yields only its first five.
for sample in ("BmsBmsBms", "BmsBmsBmsBmsBmsBmsBms"):
    s = bmsRegex.search(sample)
    print(s.group())
BmsBmsBms
BmsBmsBmsBmsBms
# NON GREEDY - denoted by '?' after {}
import re
bmsRegex = re.compile(r'(Bms){3,5}?')
# Non-greedy matching: the trailing "?" makes the quantifier stop at the
# shortest acceptable run, so only the first three "Bms" are returned even
# though the text contains more.
sample = "BmsBmsBmsBmsBmsBmsBms"
s = bmsRegex.search(sample)
print(s.group(0))
BmsBmsBms
# NON GREEDY - denoted by '?' after {}
# giving a larger number first in the braces - is wrong syntax
import re
bmsRegex = re.compile(r'(Bms){8,5}?')
# The minimum (8) is larger than the maximum (5), so re.compile() rejects the
# pattern and raises an error on the line above; the two statements below are
# never reached.
s = bmsRegex.search("BmsBmsBmsBmsBmsBmsBms")
print(s.group())
---------------------------------------------------------------------------
<ipython-input-6-cae2ba967a08> in <module>()
3 import re
5 frames
/usr/lib/python3.7/sre_parse.py in _parse(source, state, verbose, nested, first)
636 else:
findall() method:
Although the search string in the code below contains two numbers, search() returns only the first occurrence. To get every
occurrence of the pattern, not just the first match, use the findall() method.
import re
lgnumber = re.compile(r'\d\d\d\d-\d\d\d-\d\d\d\d')
numbersText = "LG numbers are 1800-322-2121 and 1800-223-3211"
# search() stops at the first number even though the text holds two.
s = lgnumber.search(numbersText)
print(s.group(0))
1800-322-2121
lgnumber = re.compile(r'\d\d\d\d-\d\d\d-\d\d\d\d')
numbersText = "LG numbers are 1800-322-2121 and 1800-223-3211"
# With no groups in the pattern, findall() returns a list of matched strings.
s = lgnumber.findall(numbersText)
print(s)
['1800-322-2121', '1800-223-3211']
# When the pattern contains groups, findall() returns a list of tuples:
# one tuple per match, holding the text captured by each group in order.
lgnumber = re.compile(r'(\d\d\d\d)-(\d\d\d)-(\d\d\d\d)')
numbersText = "LG numbers are 1800-322-2121 and 1800-223-3211"
s = lgnumber.findall(numbersText)
print(s)
Character Classes:
Pre-defined character classes :
\d for digits 0-9
\s for whitespace (space, tab, newline)
\w for word characters (letters, digits and underscore)
User-defined character classes can be made by giving the regular expressions in "[ ]"
import re
colRegex = re.compile(r'\d+\s\w+')
# \d+ : one or more digits
# \s  : a single whitespace character
# \w+ : one or more word characters
strengthText = "Strength is 3850 bmsit, 11000 bmsce, 680 bmssa"
s = colRegex.findall(strengthText)
print(s)
# Building your own character classes with [ ].
sentence = "Bmsit is a great college"
vowelRegex = re.compile(r'[aeiouAEIOU]')
s = vowelRegex.findall(sentence)
print("Vowels in given string: " , s)
# A leading ^ inside the brackets negates the class: here it matches every
# character that is neither a listed vowel nor a space.
consonantRegex = re.compile(r'[^aeiouAEIOU ]')
s = consonantRegex.findall(sentence)
print("Consonants in given string: " , s)
Vowels in given string: ['i', 'i', 'a', 'e', 'a', 'o', 'e', 'e']
Consonants in given string: ['B', 'm', 's', 't', 's', 'g', 'r', 't', 'c', 'l', 'l', 'g']
Usually we wrap the regex search in an "if" statement: search() returns None when nothing matches, and calling group() on None raises an
exception. Checking first lets us handle that case or print an error message.
import re
searchforbmsitregex = re.compile(r'bmsit')
# Run the search once and keep the result: search() returns None when the
# pattern is absent, so test for that before calling .group().
found = searchforbmsitregex.search("Hello bmsit 5th semester students")
if found is None:
    print("No matches found")
else:
    print(found.group())
bmsit
But when the caret symbol is used outside a character class, it anchors the match to the start of the string. That is, to check whether bmsit
appears at the very beginning of the given statement, we can compile the regex using "^bmsit"
The dollar symbol is used to check if the string/sentence ends with the given expression.
import re
searchforbmsit_asStartingWord_regex = re.compile(r'^bmsit')
# "^" anchors the pattern to the start of the text, and this text starts
# with "Hello", so the search yields None. Search once and reuse the result.
found = searchforbmsit_asStartingWord_regex.search("Hello bmsit 5th semester students")
if found is None:
    print("No matches found")
else:
    print(found.group())
No matches found
endsWithNumberRegex = re.compile(r'\d$')
# "$" anchors the pattern to the end of the text, so only a trailing digit
# matches. Search once and reuse the result instead of searching twice.
found = endsWithNumberRegex.search("Your lucky number is 7")
if found is None:
    print("String doesn't end with a number")
else:
    print("Number found at the end:", found.group())
import re
branches = 'it\nce\nsa\nwpu\nlaw'
# By default "." does not match a newline, so ".*" stops at the end of the
# first line.
noNewLineRegex = re.compile('.*')
print(noNewLineRegex.search(branches).group())
print("***************")
# With re.DOTALL the "." also matches newline characters, so ".*" runs
# across every line of the text.
noNewLineRegex = re.compile('.*', re.DOTALL)
print(noNewLineRegex.search(branches).group())
it
***************
it
ce
sa
wpu
law
* --> Zero or more
+ --> one or more
? --> zero or one
{5} --> matches exactly 5 repetitions of the preceding element
{5,} --> matches 5 or more repetitions
{,5} --> matches from 0 up to a maximum of 5 repetitions
{2,5} --> matches from 2 to 5 repetitions
\d --> matches a digit
\w --> matches a word character (letter, digit or underscore)
\s --> matches a whitespace character (space, tab, newline)
\D --> matches anything except a digit
\W --> matches anything except a word character
\S --> matches anything except a whitespace character
[aeiouAEIOU] --> matches any character from this
character class, here looks for a,e,i,o,u,A,E,I,O,U
[^aeiouAEIOU] --> matches any character NOT PRESENT in
the given character class, here not a vowel.
The . matches any character, except newline characters.
^xyz --> matches the expression IFF it is the starting
sequence of the given sentence.
xyz$ --> matches the expression IFF it is the ending
sequence of the given sentence.
Case-Insensitive Matching re.I
# CASE SENSITIVE EXAMPLE:
import re
colRegex = re.compile(r'bmsit')
spellings = "BMSIT BMSit bmsit BMsit bmSit bmsit bmsit"
# search() stops at the first exact-case occurrence of bmsit.
print(colRegex.search(spellings).group())
print("******************")
# findall() collects every occurrence of the expression.
print(colRegex.findall(spellings))
bmsit
******************
# CASE INSENSITIVE EXAMPLE:
import re
colRegex = re.compile(r'bmsit', re.I)
spellings = "BMSIT BMSit bmsit BMsit bmSit bmsit bmsit"
# With re.I letter case is ignored, so search() already stops at "BMSIT".
print(colRegex.search(spellings).group())
print("******************")
# findall() now returns every spelling, each as it appears in the text.
print(colRegex.findall(spellings))
BMSIT
******************
import re
colRegex = re.compile(r'bmsit')
print(colRegex.sub('BMS institute of Technology','bmsit is great'))
# sub(replacement, text) replaces every occurrence of 'bmsit' in the second
# argument with the first argument.
colRegex1 = re.compile(r'bmsit (\w)\w*')
# NOTE(review): the replacement below contains no group reference, so the
# letter captured by (\w) is unused and the whole match is replaced by the
# literal text \'|***. Presumably r'\1***' (keep the first letter, mask the
# rest) was intended - confirm before changing.
print(colRegex1.sub(r'\'|***','bmsit cse is great'))
\'|*** is great
import re
emails = ['abc@xyz.com','sss@yahoo.in','zzz@yamaha.biz']
# Swap the "@domain.tld" part of each address for "@gmail.com".
domainPattern = re.compile(r'@\w+\.(\w+)')
gmailSubstitute = []
for address in emails:
    gmailSubstitute.append(domainPattern.sub('@gmail.com', address))
print(gmailSubstitute)
Verbose :
A '#' comment written inside the search pattern does not work by default, because re tries to match the '#' character and everything after it as part of the pattern.
To have such text inside the search pattern ignored, the way Python comments are, we use re.VERBOSE.
The same applies to whitespace inside the pattern, such as the spaces before a comment or at the end of the pattern: re.VERBOSE ignores it.
# Without VERBOSE
import re
# Without re.VERBOSE the " # this is ..." text is part of the pattern itself,
# so nothing in the directory text can match it.
phoneRegex = re.compile(r'\d{3}-\d{3}-\d{4} # this is trying to find LG phone numbers')
directory = 'Lg\'s India\'s Phone number is 000-521-7878 \nLG\'s SriLanka\'s Phone number is 121-454-6969'
print(phoneRegex.findall(directory))
[]
# With VERBOSE
import re
# With re.VERBOSE the whitespace and the "# ..." comment inside the pattern
# are ignored, leaving just the phone-number expression.
phoneRegex = re.compile(r'\d{3}-\d{3}-\d{4} # this is trying to find LG phone numbers', re.VERBOSE)
directory = 'Lg\'s India\'s Phone number is 000-521-7878 \nLG\'s SriLanka\'s Phone number is 121-454-6969'
print(phoneRegex.findall(directory))
['000-521-7878', '121-454-6969']
# Combining re.IGNORECASE and re.VERBOSE
import re
phoneRegex = re.compile(r'\d{3}-\d{3}-\d{4} # this is trying to find LG phone numbers', re.VERBOSE)
contacts = 'LG India\'s Phone number is 000-521-7878 and LG Srilanka\'s Phone number is 121-454-6969'
print(phoneRegex.findall(contacts))
# Several flags are combined with the bitwise OR operator "|".
phoneRegex2 = re.compile(r'bmsit # this is how you can combine ignorecase and verbose', re.VERBOSE | re.IGNORECASE)
campus = 'bmsit is located at yelahanka and BMSIT is having a strength of 4000 students'
print(phoneRegex2.findall(campus))
['000-521-7878', '121-454-6969']
['bmsit', 'BMSIT']
phoneRegex = re.compile(r'''(
(\d{3}|\(\d{3}\))? # area code
(\s|-|\.)? # separator
\d{3} # first 3 digits
(\s|-|\.) # separator
\d{4} # last 4 digits
(\s*(ext|x|ext.)\s*\d{2,5})? # extension
)''', re.VERBOSE)
Contact Us
No Starch Press, Inc.
245 8th Street
San Francisco, CA 94103 USA
Phone: 800.420.7240 or +1 415.863.9900 (9 a.m. to 5 p.m., M-F, PST)
Fax: +1 415.863.9950
Reach Us by Email
General inquiries: info@nostarch.com
Media requests: media@nostarch.com
Academic requests: academic@nostarch.com (Further information)
Conference and Events: conferences@nostarch.com
Help with your order: info@nostarch.com
!pip install pyperclip
import pyperclip, re
# Phone-number pattern. findall() returns one tuple per match with indexes:
# 0 whole number, 1 area code, 2 separator, 3 first three digits,
# 4 separator, 5 last four digits, 6 whole extension, 7 extension marker,
# 8 extension digits.
# The dot in the "ext." alternative is escaped so that it matches a literal
# "." and not any character.
phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?              # area code
    (\s|-|\.)?                      # separator
    (\d{3})                         # first 3 digits
    (\s|-|\.)                       # separator
    (\d{4})                         # last 4 digits
    (\s*(ext|x|ext\.)\s*(\d{2,5}))? # extension
    )''', re.VERBOSE)

# Create email regex.
emailRegex = re.compile(r'''(
    [a-zA-Z0-9._%+-]+               # username
    @                               # @ symbol
    [a-zA-Z0-9.-]+                  # domain name
    (\.[a-zA-Z]{2,4})               # dot-something
    )''', re.VERBOSE)

# Find matches in clipboard text.
text = str(pyperclip.paste())
matches = []
for groups in phoneRegex.findall(text):
    # The area code is optional; leave out empty parts so a number without
    # one is not written with a leading "-".
    phoneNum = '-'.join(part for part in (groups[1], groups[3], groups[5]) if part)
    if groups[8] != '':
        phoneNum += ' x' + groups[8]
    matches.append(phoneNum)
for groups in emailRegex.findall(text):
    # Index 0 is the outermost group, i.e. the complete address.
    matches.append(groups[0])

# Copy results to the clipboard.
if matches:
    pyperclip.copy('\n'.join(matches))
    print('Copied to clipboard:')
    print('\n'.join(matches))
else:
    print('No phone numbers or email addresses found.')
include path
include os
# Converting to path :
from pathlib import Path
# Path() joins its arguments with the separator of the current platform.
print("Converting to a path -->", Path('bmsitWebsite','resources','views','placement.blade.php'))
# Add some files in a loop
FileList = ['faculty.php','career.php','contact.php']
# The base directory does not change, so build it once outside the loop and
# append each file name with the "/" operator.
viewsDir = Path(r'D:\bmsitWebsite\resources\views')
for file in FileList:
    print(viewsDir / file)
D:\bmsitWebsite\resources\views/faculty.php
D:\bmsitWebsite\resources\views/career.php
D:\bmsitWebsite\resources\views/contact.php