You are on page 1 of 2

Name: Khemal Desai

Roll No: I008


Batch: A1
N-gram Method

Code:
from collections import defaultdict
import re

# Word-level N-gram next-word predictor.
#
# Counts every run of `n` consecutive words in a small corpus, then looks up
# the N-grams whose first n-1 words equal the tail of `context` and prints the
# most frequent continuations.

# Define the text corpus. Adjacent string literals are concatenated by the
# parser, so the two sentences form one string without an over-long line.
text = (
    'The quick brown fox jumps over the lazy dog. '
    'The quick brown fox likes to jump over the lazy cat.'
)

# Clean the text: lowercase it, then remove every character that is neither
# a word character nor whitespace (i.e. the punctuation).
text = re.sub(r'[^\w\s]', '', text.lower())

# Split the cleaned text into word tokens. The N-grams must be built from
# words, not from the raw string: slicing the string would yield character
# pairs, which can never match the word-based prefix used below.
tokens = text.split()

# Define the size of the N-gram (2 = bigrams)
n = 2

# Generate the N-grams: zipping the token list with itself shifted by
# 0..n-1 positions yields one tuple per window of n consecutive words.
# zip() returns a single-pass iterator; it is consumed by the loop below.
ngrams = zip(*[tokens[i:] for i in range(n)])

# Count the frequency of each N-gram
ngram_freq = defaultdict(int)
for ngram in ngrams:
    ngram_freq[ngram] += 1

# Define the context for prediction
context = 'the quick'

# Split the context into tokens
context_tokens = context.split()

# Get the last N-1 tokens as the prefix for the N-gram. An explicit start
# index is used because the slice [-(n - 1):] turns into [-0:] when n == 1,
# which selects the whole list instead of the intended empty prefix.
prefix = tuple(context_tokens[max(len(context_tokens) - (n - 1), 0):])

# Find all N-grams whose leading N-1 words match the prefix
matching_ngrams = [
    (ngram, freq)
    for ngram, freq in ngram_freq.items()
    if ngram[:-1] == prefix
]

# Sort the matching N-grams by frequency in descending order. sorted() is
# stable, so equally frequent N-grams stay in first-seen order.
matching_ngrams = sorted(matching_ngrams, key=lambda x: x[1], reverse=True)

# Print the top predicted next words (at most three)
for ngram, freq in matching_ngrams[:3]:
    next_word = ngram[-1]
    print(f'Next word: {next_word}, frequency: {freq}')

You might also like