You are on page 1of 10

REGULAR EXPRESSIONS

Without using a regular expression

def isLGNumber(t):
    """Return True if *t* has the LG customer-care shape dddd-ddd-dddd.

    The text must be exactly 13 characters long, with a hyphen at
    index 4 and index 8 and a decimal digit at every other index.
    """
    if len(t) != 13:
        return False
    for position, symbol in enumerate(t):
        if position in (4, 8):
            # Separator slots between the three digit groups.
            if symbol != "-":
                return False
        elif not symbol.isdecimal():
            return False
    return True

# Describe the expected format, then run the checker on three sample inputs.
print("LG - customer care numbers are of the form dddd-ddd-dddd where d is any number\n\n")
for position, candidate in enumerate(('1800-315-9999', '1800-315-9998', 'w800-315-9998')):
    # Every question after the first one is preceded by a blank line.
    lead = "\n" if position else ""
    print(lead + "Is " + candidate + " an LG customer care number? ")
    print(isLGNumber(candidate))

LG - customer care numbers are of the form dddd-ddd-dddd where d is any number

Is 1800-315-9999 an LG customer care number?

True

Is 1800-315-9998 an LG customer care number?

True

Is w800-315-9998 an LG customer care number?

False

runningText = "LG numbers are 1800-315-9999 and 1800-180-9999 and 800-180-9999"

# Slide a 13-character window over the text, one start index at a time,
# and report every window that has the LG number shape.
windows = (runningText[start:start + 13] for start in range(len(runningText)))
for extractPhone in filter(isLGNumber, windows):
    print("Phone Number Found - " + extractPhone)

print("All possible matches found")

Phone Number Found - 1800-315-9999

Phone Number Found - 1800-180-9999

All possible matches found

Using regular expressions for the same problem :


There are three steps for pattern matching, after importing the regex module with import re :

1. Compile : Create a Regex object - the expression that needs to be matched is written within this.
2. Search : Pass the string you want to search - the string from where the expression is to be matched.
3. Group : Call the Match object’s group() method to return a string of the actual matched text.

# Finding patterns of text with regular expressions:

''' steps - compile, search, group '''

# Lg number for reference - 1800-315-9999

import re

lgnumber = re.compile(r'(\d\d\d\d)-(\d\d\d)-(\d\d\d\d)')
s = lgnumber.search("LG customer care for india number is 4800-180-9999")

# group(0) is the whole match; group(1)..group(3) are the parenthesised parts.
for label, number in (
    ("Entire group number: ", 0),
    ("First member of group: ", 1),
    ("Second member of group: ", 2),
    ("Third member of group: ", 3),
):
    print(label, s.group(number))

print("Using groups instead of group")
# groups() hands back every captured part at once, as a tuple.
group1, group2, group3 = s.groups()
print("\n".join((group1, group2, group3)))

Entire group number: 4800-180-9999

First member of group: 4800

Second member of group: 180

Third member of group: 9999

Using groups instead of group

4800

180

9999

The \( and \) escape characters in the raw string passed to re.compile() will match actual parenthesis characters. In regular expressions, the
following characters have special meanings:

.^$*+?{}[]\|()

If you want to detect these characters as part of your text pattern, you need to escape them with a backslash:

\. \^ \$ \* \+ \? \{ \} \[ \] \\ \| \( \)

PIPE - "OR" operation

# More on Pattern Matching with Regular Expressions: 

# PIPE - "OR" operation

import re

collegeRegex = re.compile(r'bmsit|bmsce')

# search() reports whichever alternative shows up first in the text.
for sentence in ("bmsit and bmsce", "bmsis and bmsce"):
    s = collegeRegex.search(sentence)
    print(s.group())

bmsit

bmsce

s = collegeRegex.search("bmsis and bmscs")

print(s.group())        

# No matches, hence gives an error: search() returned None here, and None
# has no group() method, so this line raises AttributeError.

---------------------------------------------------------------------------

AttributeError Traceback (most recent call last)

<ipython-input-70-ffcc446bd934> in <module>()

1 s = collegeRegex.search("bmsis and bmscs")

----> 2 print(s.group()) # No matches, hence gives an error

AttributeError: 'NoneType' object has no attribute 'group'

SEARCH STACK OVERFLOW

# More on Pattern Matching with Regular Expressions: 

# PIPE - "OR" operation

import re

collegeRegex = re.compile(r'bms(it|ce|wpu|sa|cl)')
s = collegeRegex.search("bmssa has 800 students and bmswpu has 6000 students")

# Asking for groups 0 and 1 together yields (whole match, chosen suffix).
for piece in s.group(0, 1):
    print(piece)

bmssa

sa

Optional matching with ? (matching 0 or 1 instances)

# More on Pattern Matching with Regular Expressions: 

# ? = optional matching with ?  (matching 0 or 1 instances)

collegeRegex = re.compile(r'Bms( college of)? law')

# For each sentence print the whole match, then the optional group
# (which is None when the sentence skips " college of").
for sentence in ("Bms college of law", "Bms law"):
    s = collegeRegex.search(sentence)
    print(*s.group(0, 1), sep="\n")

Bms college of law

college of

Bms law

None

Matching 0 or more instances with "*"

# More on Pattern Matching with Regular Expressions: 

# * = matching 0 or more with "*"

collegeRegex = re.compile(r'Bms( college of)* law')

# '*' accepts the group zero, one or many times in a row.
for sentence in (
    "Bms law",
    "Bms college of law",
    "Bms college of college of college of law",
):
    s = collegeRegex.search(sentence)
    print(s.group())

# Group 1 of the last search (the sentence with three repetitions).
print(s.group(1))

Bms law

Bms college of law

Bms college of college of college of law

college of

Matching 1 or more with "+"

# SAME CODE AS ABOVE, BUT + INSTEAD, thus "bms law" gives error

# More on Pattern Matching with Regular Expressions: 

# * = matching 0 or more with "*"
# + = matching 1 or more with "+" 

collegeRegex = re.compile(r'Bms( college of)+ law')

# '+' needs the group at least once; both sentences below satisfy that.
for sentence in ("Bms college of law", "Bms college of college of college of law"):
    s = collegeRegex.search(sentence)
    print(s.group())

Bms college of law

Bms college of college of college of law

s = collegeRegex.search("Bms law")

print(s.group())

# With '+', " college of" must appear at least once, so "Bms law" does not
# match: search() returned None and calling .group() raises AttributeError.

---------------------------------------------------------------------------

AttributeError Traceback (most recent call last)

<ipython-input-72-ab867c616037> in <module>()

1 s = collegeRegex.search("Bms law")

----> 2 print(s.group())

AttributeError: 'NoneType' object has no attribute 'group'

SEARCH STACK OVERFLOW

Matching specific number of repetitions using { }

# More on Pattern Matching with Regular Expressions: 

# Matching a specific number of repetitions using {}

# {3} requires the group "Bms" exactly three times in a row.
bmsRegex = re.compile(r'(Bms){3}')
s = bmsRegex.search("BmsBmsBms")
# Indexing a Match with 0 gives the whole matched text.
print(s[0])

BmsBmsBms

Greedy and Non-Greedy methods of search:

# GREEDY 

import re

bmsRegex = re.compile(r'(Bms){3,5}')

# {3,5} is greedy: it takes as many repetitions as it can, up to five.
# Three repetitions come back as three; seven come back as the first five.
for sample in ("BmsBmsBms", "BmsBmsBmsBmsBmsBmsBms"):
    s = bmsRegex.search(sample)
    print(s.group())

BmsBmsBms

BmsBmsBmsBmsBms

# NON GREEDY - denoted by '?' after {}

import re

# A '?' after the braces makes the quantifier non-greedy: it stops at the
# smallest allowed count, so only the first three "Bms" are returned.
bmsRegex = re.compile(r'(Bms){3,5}?')
repeated = "BmsBmsBmsBmsBmsBmsBms"
s = bmsRegex.search(repeated)
print(s[0])

BmsBmsBms

# NON GREEDY - denoted by '?' after {}

# giving a larger number first in the braces - is wrong syntax

import re

bmsRegex = re.compile(r'(Bms){8,5}?')      

# nongreedy - thus minimum match is considered, 

# so if a string with more than 5 "Bms" is searched for, 

# it gives back the first 3 occurrences only

# NOTE: the three comment lines above were carried over from the previous
# example and do not describe this cell. {8,5} puts the larger bound first,
# so re.compile() itself raises re.error ("min repeat greater than max
# repeat") and the search/print below never run.

s = bmsRegex.search("BmsBmsBmsBmsBmsBmsBms")

print(s.group())

---------------------------------------------------------------------------

error Traceback (most recent call last)

<ipython-input-6-cae2ba967a08> in <module>()

2 # giving a larger number first in the braces - is wrong syntax

3 import re

----> 4 bmsRegex = re.compile(r'(Bms){8,5}?')

5 # nongreedy - thus minimum match is considered,

6 # so if a string with more than 5 "Bms" is searched for,

5 frames
/usr/lib/python3.7/sre_parse.py in _parse(source, state, verbose, nested, first)

633 if max < min:

634 raise source.error("min repeat greater than max repeat",

--> 635 source.tell() - here)

636 else:

637 raise AssertionError("unsupported quantifier %r" % (char,))

error: min repeat greater than max repeat at position 6

SEARCH STACK OVERFLOW

findall() method:
search() returns only the first occurrence: in the code below the search string contains two numbers, but only the first one is printed.
To get every occurrence of the pattern, not just the first match, use the findall() method.

import re

lgnumber = re.compile(r'\d\d\d\d-\d\d\d-\d\d\d\d')
haystack = "LG numbers are 1800-322-2121 and 1800-223-3211"
# search() stops at the first number it finds.
s = lgnumber.search(haystack)
print(s[0])

1800-322-2121

lgnumber = re.compile(r'\d\d\d\d-\d\d\d-\d\d\d\d')
# Collect the text of every match, left to right.
s = [
    found.group()
    for found in lgnumber.finditer("LG numbers are 1800-322-2121 and 1800-223-3211")
]
print(s)

['1800-322-2121', '1800-223-3211']

# When the pattern contains groups, each match is reported as a tuple
# holding the text captured by every group, one tuple per match.
lgnumber = re.compile(r'(\d\d\d\d)-(\d\d\d)-(\d\d\d\d)')
s = [
    found.groups()
    for found in lgnumber.finditer("LG numbers are 1800-322-2121 and 1800-223-3211")
]
print(s)

[('1800', '322', '2121'), ('1800', '223', '3211')]

Character Classes:
Pre-defined character classes :

\d for a digit (0-9)
\s for a whitespace character (space, tab, newline)
\w for a word character (letter, digit or underscore)

User-defined character classes can be made by giving the regular expressions in "[ ]"
import re

# \d+ = one or more digits, \s = one whitespace character,
# \w+ = one or more word characters
colRegex = re.compile(r'\d+\s\w+')
s = [
    found.group()
    for found in colRegex.finditer("Strength is 3850 bmsit, 11000 bmsce, 680 bmssa")
]
print(s)

['3850 bmsit', '11000 bmsce', '680 bmssa']

# Making your own character class / Formation of character classes : 

vowelRegex = re.compile(r'[aeiouAEIOU]')
# A leading ^ inside [] negates the class: anything that is not a vowel
# and not a space.
consonantRegex = re.compile(r'[^aeiouAEIOU ]')

for label, regex in (
    ("Vowels in given string: ", vowelRegex),
    ("Consonants in given string: ", consonantRegex),
):
    s = regex.findall("Bmsit is a great college")
    print(label, s)

Vowels in given string: ['i', 'i', 'a', 'e', 'a', 'o', 'e', 'e']

Consonants in given string: ['B', 'm', 's', 't', 's', 'g', 'r', 't', 'c', 'l', 'l', 'g']

Usually we wrap the regex search in an "if" statement to avoid exceptions when nothing matches, and instead handle that case or print an error
message.

import re

searchforbmsitregex = re.compile(r'bmsit')

# search() returns None when nothing matches, so test the result with
# "is None" before calling .group(). The search runs once and the Match
# object is reused instead of scanning the text a second time.
found = searchforbmsitregex.search("Hello bmsit 5th semester students")
if found is None:
    print("No matches found")
else:
    print(found.group())

bmsit

Caret and Dollar sign characters:


The caret symbol within a character class [ ] negates the elements in the class.

But when the caret symbol is used at the start of a pattern, outside a character class, it anchors the match to the beginning of the string. That is, to check whether the given
statement starts with bmsit, we can compile the regular expression "^bmsit".
The dollar symbol at the end of a pattern is used to check whether the string/sentence ends with the given expression.

import re

searchforbmsit_asStartingWord_regex = re.compile(r'^bmsit')

# '^' anchors the pattern to the start of the text. The search runs once;
# the result is compared with "is None" and reused for printing.
found = searchforbmsit_asStartingWord_regex.search("Hello bmsit 5th semester students")
if found is None:
    print("No matches found")
else:
    print(found.group())

No matches found

endsWithNumberRegex = re.compile(r'\d$')

# '$' anchors the digit to the end of the text. The search runs once;
# the result is compared with "is None" and reused for printing.
found = endsWithNumberRegex.search("Your lucky number is 7")
if found is None:
    print("String doesn't end with a number")
else:
    print("Number found at the end:", found.group())

Number found at the end: 7

The Wildcard character re.DOTALL


The . (or dot) character in a regular expression is called a wildcard and will match any character except for a newline.

re.DOTALL - makes the dot match every character, including the newline

import re

multiline = 'it\nce\nsa\nwpu\nlaw'

# Without flags the dot stops at the first newline, so only "it" matches.
noNewLineRegex = re.compile('.*')
print(noNewLineRegex.search(multiline).group())

print("***************")

# With re.DOTALL the dot also matches newlines, so '.*' runs through
# the entire text.
noNewLineRegex = re.compile('.*', flags=re.DOTALL)
print(noNewLineRegex.search(multiline).group())

it

***************

it

ce

sa

wpu

law

Review of regex symbols

* --> Zero or more

+ --> one or more

? --> zero or one

{5} --> matches exactly 5 repetitions of the preceding item

{5,} --> matches 5 or more repetitions

{,5} --> matches from 0 up to a maximum of 5 repetitions

{2,5} --> matches from 2 to 5 repetitions

\d --> matches a digit

\w --> matches a word character (letter, digit or underscore)

\s --> matches a whitespace character (space, tab, newline)

\D --> matches anything except a digit

\W --> matches anything except a word character

\S --> matches anything except a whitespace character

[aeiouAEIOU] --> matches any character from this

character class, here looks for a,e,i,o,u,A,E,I,O,U

[^aeiouAEIOU] --> matches any character NOT PRESENT in

the given character class, here not a vowel.

The . matches any character, except newline characters.
^xyz --> matches the expression IFF it is the starting 
sequence of the given sentence.

xyz$ --> matches the expression IFF it is the ending 

sequence of the given sentence.
Case-Insensitive Matching re.I

# CASE SENSITIVE EXAMPLE: 

import re

colRegex = re.compile(r'bmsit')
sample = "BMSIT BMSit bmsit BMsit bmSit bmsit bmsit"

# search() reports only the first exact-case "bmsit".
print(colRegex.search(sample).group())

print("******************")

# findall() lists every exact-case "bmsit" in the text.
print(colRegex.findall(sample))

bmsit

******************

['bmsit', 'bmsit', 'bmsit']

# CASE INSENSITIVE EXAMPLE: 

import re

colRegex = re.compile(r'bmsit', flags=re.I)
sample = "BMSIT BMSit bmsit BMsit bmSit bmsit bmsit"

# With re.I the first hit is "BMSIT": letter case no longer matters.
print(colRegex.search(sample).group())

print("******************")

# findall() now returns every spelling, whatever its case.
print(colRegex.findall(sample))

BMSIT

******************

['BMSIT', 'BMSit', 'bmsit', 'BMsit', 'bmSit', 'bmsit', 'bmsit']

Substituting strings with sub( ) method :

import re

colRegex = re.compile(r'bmsit')

print(colRegex.sub('BMS institute of Technology','bmsit is great')) 

# Replaces every occurrence of 'bmsit' in the second argument of 'sub'

# with first argument.

# NOTE(review): the pattern below captures the first letter of the word after
# "bmsit" in group 1, but the replacement never refers to that group, so the
# whole match is replaced by the literal text \'|*** (see the printed output).
# A backreference such as r'\1***' may have been intended -- confirm before
# changing it.
colRegex1 = re.compile(r'bmsit (\w)\w*')

print(colRegex1.sub(r'\'|***','bmsit cse is great'))

BMS institute of Technology is great

\'|*** is great

Managing complex regexes

import re

emails = ['abc@xyz.com','sss@yahoo.in','zzz@yamaha.biz']

# Swap everything from the '@' through the top-level domain for '@gmail.com'.
domainPattern = re.compile(r'@\w+\.(\w+)')
gmailSubstitute = []
for address in emails:
    gmailSubstitute.append(domainPattern.sub('@gmail.com', address))

print(gmailSubstitute)

['abc@gmail.com', 'sss@gmail.com', 'zzz@gmail.com']

Verbose :
Writing a Python-style '#' comment inside the search pattern does not work by default, because re tries to match the '#' and everything after it as literal text.

To have comments inside the search pattern ignored, pass re.VERBOSE when compiling the pattern.

With re.VERBOSE, whitespace inside the pattern (for example the spaces before a comment or at the end of the pattern) is ignored as well.

# Without VERBOSE
import re

# No re.VERBOSE here: the '#' and the words after it are part of the pattern,
# so nothing in the sample text matches and the result is an empty list.
phoneRegex = re.compile(r'\d{3}-\d{3}-\d{4} # this is trying to find LG phone numbers')
sampleText = 'Lg\'s India\'s Phone number is 000-521-7878 \nLG\'s SriLanka\'s Phone number is 121-454-6969'
print(phoneRegex.findall(sampleText))

[]

# With VERBOSE

import re

# re.VERBOSE drops the whitespace and the trailing '#' comment from the
# pattern, so both phone numbers in the sample text are found.
phoneRegex = re.compile(
    r'\d{3}-\d{3}-\d{4} # this is trying to find LG phone numbers',
    flags=re.VERBOSE,
)
sampleText = 'Lg\'s India\'s Phone number is 000-521-7878 \nLG\'s SriLanka\'s Phone number is 121-454-6969'
print(phoneRegex.findall(sampleText))

['000-521-7878', '121-454-6969']

# Combining re .IGNORECASE, and re .VERBOSE

import re

phoneRegex = re.compile(
    r'\d{3}-\d{3}-\d{4} # this is trying to find LG phone numbers',
    flags=re.VERBOSE,
)
print(phoneRegex.findall('LG India\'s Phone number is 000-521-7878 and LG Srilanka\'s Phone number is 121-454-6969'))

# Flags are combined with the bitwise-or operator.
phoneRegex2 = re.compile(
    r'bmsit # this is how you can combine ignorecase and verbose',
    flags=re.IGNORECASE | re.VERBOSE,
)
print(phoneRegex2.findall('bmsit is located at yelahanka and BMSIT is having a strength of 4000 students'))

['000-521-7878', '121-454-6969']

['bmsit', 'BMSIT']

MANAGING COMPLEX REGEXES

# Phone-number pattern written with re.VERBOSE so each part can carry a comment.
# The dot in "ext." is escaped: an unescaped dot would accept "ext" followed by
# ANY character (for example "extA") as an extension marker.
phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?              # area code (optional, with or without parentheses)
    (\s|-|\.)?                      # separator (optional)
    \d{3}                           # first 3 digits
    (\s|-|\.)                       # separator
    \d{4}                           # last 4 digits
    (\s*(ext\.|ext|x)\s*\d{2,5})?   # extension (optional)
    )''', re.VERBOSE)

Project: Phone Number and Email Address Extractor


The copied text to the program -

Contact Us

No Starch Press, Inc.

245 8th Street

San Francisco, CA 94103 USA

Phone: 800.420.7240 or +1 415.863.9900 (9 a.m. to 5 p.m., M-F, PST)

Fax: +1 415.863.9950

Reach Us by Email

General inquiries: info@nostarch.com

Media requests: media@nostarch.com

Academic requests: academic@nostarch.com (Further information)

Conference and Events: conferences@nostarch.com

Help with your order: info@nostarch.com

!pip install pyperclip

import pyperclip, re

# Phone-number pattern. findall() returns one tuple per match with these items:
#   [0] whole number, [1] area code, [2] separator, [3] first 3 digits,
#   [4] separator, [5] last 4 digits, [6] whole extension, [7] extension
#   marker, [8] extension digits.
# The dot in "ext." is escaped: an unescaped dot would accept "ext" followed by
# any character as an extension marker.
phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?                # area code
    (\s|-|\.)?                        # separator
    (\d{3})                           # first 3 digits
    (\s|-|\.)                         # separator
    (\d{4})                           # last 4 digits
    (\s*(ext\.|ext|x)\s*(\d{2,5}))?   # extension
    )''', re.VERBOSE)

# Create email regex.
emailRegex = re.compile(r'''(
    [a-zA-Z0-9._%+-]+       # username
    @                       # @ symbol
    [a-zA-Z0-9.-]+          # domain name
    (\.[a-zA-Z]{2,4})       # dot-something
    )''', re.VERBOSE)

# Find matches in clipboard text.
text = str(pyperclip.paste())

matches = []

for groups in phoneRegex.findall(text):
    # Normalise to area-first-last joined by '-'. An absent area code is an
    # empty string; leaving it out avoids a leading '-' in the result.
    phoneNum = '-'.join(part for part in (groups[1], groups[3], groups[5]) if part)
    if groups[8]:
        phoneNum += ' x' + groups[8]
    matches.append(phoneNum)

# Item 0 of each email tuple is the complete address.
matches.extend(groups[0] for groups in emailRegex.findall(text))

# Copy results to the clipboard.
if matches:
    pyperclip.copy('\n'.join(matches))
    print('Copied to clipboard:')
    print('\n'.join(matches))
else:
    print('No phone numbers or email addresses found.')

Files and File paths :


Two major modules :

pathlib (provides the Path class)
os

# Converting to path : 

from pathlib import Path

# The / operator joins path components, just like passing them all to Path().
print("Converting to a path -->", Path('bmsitWebsite') / 'resources' / 'views' / 'placement.blade.php')

Converting to a path --> bmsitWebsite/resources/views/placement.blade.php

# Add some files in a loop

FileList = ['faculty.php','career.php','contact.php']

# Build the folder path once, then append each file name to it.
viewsFolder = Path(r'D:\bmsitWebsite\resources\views')
for fileName in FileList:
    print(viewsFolder / fileName)

D:\bmsitWebsite\resources\views/faculty.php

D:\bmsitWebsite\resources\views/career.php

D:\bmsitWebsite\resources\views/contact.php

You might also like