You are on page 1 of 10

Team11_CH.EN.

U4AIE20054_Assignment3

November 15, 2021

0.1 Intelligence of Biological Systems 3


0.2 Assignment 3
0.2.1 Name:Rohith ND
0.2.2 Roll No:20054
0.3 Question 1
# In[ ]:
import time

start = time.time()
# Plain sequence literal: quote characters embedded in the string would be
# treated as bases and leak into the first and last k-mers.
seq = "TGCTGCTG"


def StringKmerCompo(sequence, kmerlenght):
    """Return every overlapping k-mer of ``sequence`` in left-to-right order.

    Args:
        sequence: String (or other sliceable sequence) to decompose.
        kmerlenght: Window length k; must be at least 1.

    Returns:
        A list with the ``len(sequence) - k + 1`` windows of length k, or an
        empty list when ``sequence`` is shorter than k.

    Raises:
        ValueError: If ``kmerlenght`` is smaller than 1.
    """
    if kmerlenght < 1:
        raise ValueError("kmerlenght must be a positive integer")
    last_start = len(sequence) - kmerlenght
    return [sequence[i:i + kmerlenght] for i in range(last_start + 1)]


for e in StringKmerCompo(seq, 3):
    print(e)
end = time.time()
print(end - start)  # elapsed wall-clock seconds since `start`

'TG
TGC
GCT
CTG
TGC
GCT
CTG
TG'
0.001967906951904297

[ ]: # The divided 3-mers are given as input and the overlapping patterns are returned in the form of an adjacency list

def OverlapGraph(reads):
    """Build the overlap graph of ``reads`` as an adjacency mapping.

    There is an edge ``a -> b`` when ``a`` without its first character equals
    ``b`` without its last character.

    Args:
        reads: Iterable of reads (hashable, sliceable values such as strings).

    Returns:
        A dict keyed by each distinct read, in order of first appearance.
        Each value lists the reads it overlaps into, in input order; a read
        that occurs several times in ``reads`` is listed several times.
    """
    matrix = {}
    by_prefix = {}
    for read in reads:
        matrix[read] = []
        # Index reads by their prefix so each suffix needs a single lookup
        # instead of a scan over every read.
        by_prefix.setdefault(read[:-1], []).append(read)

    for read in matrix:
        # Copy so that no two keys share one successor list.
        matrix[read] = list(by_prefix.get(read[1:], ()))

    return matrix

seq = "GTGCTATGCT"
result = OverlapGraph(StringKmerCompo(seq, 3))
# Print only the reads that have at least one successor.
for e in result:
    if result[e]:
        # Join the successors directly; stripping quotes out of the list's
        # repr breaks as soon as a read itself contains a quote character.
        print(e + " - > " + ", ".join(result[e]))

GTG - > TGC, TGC


TGC - > GCT, GCT
GCT - > CTA
CTA - > TAT
TAT - > ATG
ATG - > TGC, TGC

[ ]: # Finding the Hamiltonian paths, with the 3-mers numbered as nodes:


# GTG -- > 0 TGC -- > 1 GCT -- > 2 CTA -- > 3 TAT --> 4 ATG --> 5

class Graph:
    """Adjacency-list graph over the integer nodes ``0 .. n-1``.

    ``adjList[v]`` lists the neighbours of node ``v`` in the order the edges
    were supplied.
    """

    def __init__(self, edges, n, directed=False):
        """Create the graph from an edge list.

        Args:
            edges: Iterable of ``(src, dest)`` pairs of node indices.
            n: Number of nodes; indices must lie in ``range(n)``.
            directed: When False (the default, matching the original
                behaviour) every edge is stored in both directions. When
                True only ``src -> dest`` is stored. Suffix/prefix overlap
                edges are directional, so pass True to keep paths from
                traversing them backwards.
        """
        # One neighbour list per node.
        self.adjList = [[] for _ in range(n)]

        for src, dest in edges:
            self.adjList[src].append(dest)
            if not directed:
                self.adjList[dest].append(src)

def hamiltonianPaths(graph, v, visited, path, n):
    """Print every Hamiltonian path that extends ``path`` from node ``v``.

    Depth-first backtracking search: each unvisited neighbour of ``v`` is
    appended in turn, explored recursively, and removed again.

    Args:
        graph: Object whose ``adjList[v]`` lists the neighbours of node ``v``.
        v: Last node currently on ``path``.
        visited: List of booleans indexed by node, True for nodes already on
            ``path``. Modified during the search and restored before return.
        path: Nodes visited so far, ending at ``v``. Modified during the
            search and restored before return.
        n: Number of nodes a complete path must contain.
    """
    if len(path) == n:
        # Every node is on the path: report it.
        print(path)
        return

    for w in graph.adjList[v]:
        if visited[w]:
            continue
        visited[w] = True
        path.append(w)

        hamiltonianPaths(graph, w, visited, path, n)

        # Backtrack so the next neighbour starts from the same state.
        visited[w] = False
        path.pop()

def findHamiltonianPaths(graph, n):
    """Print all Hamiltonian paths of ``graph``, trying each node as start.

    Args:
        graph: Graph handed through to ``hamiltonianPaths``.
        n: Number of nodes; nodes are the integers ``0 .. n-1``.
    """
    for start in range(n):
        # Fresh bookkeeping per start node: only the start is on the path.
        visited = [node == start for node in range(n)]
        hamiltonianPaths(graph, start, visited, [start], n)

if __name__ == "__main__":
    # Chain 0-1-2-3-4-5 plus the extra edge 5-1 between the six numbered
    # 3-mers.
    edges = [(node, node + 1) for node in range(5)] + [(5, 1)]
    n = 6
    graph = Graph(edges, n)

    findHamiltonianPaths(graph, n)

[0, 1, 2, 3, 4, 5]
[0, 1, 5, 4, 3, 2]
[2, 3, 4, 5, 1, 0]
[5, 4, 3, 2, 1, 0]

# In[ ]:
import re

# Node-index paths to translate back into 3-mers.
string = ('[0, 1, 2, 3, 4, 5],[0, 1, 5, 4, 3, 2],'
          '[2, 3, 4, 5, 1, 0],[5, 4, 3, 2, 1, 0]')

# GTG -- > 0 TGC -- > 1 GCT -- > 2 CTA -- > 3 TAT --> 4 ATG --> 5
dct = {'0': 'GTG', '1': 'TGC', '2': 'GCT', '3': 'CTA', '4': 'TAT', '5': 'ATG'}

# Replace each whole integer rather than each digit, so the translation keeps
# working once node ids reach 10 or more. Every other character is copied.
# NOTE(review): only the first path follows suffix -> prefix overlaps from end
# to end; the others contain a step against the overlap direction
# (e.g. TGC -> ATG).
newstr = re.sub(r'\d+', lambda match: dct[match.group()], string)
print(newstr)

[GTG, TGC, GCT, CTA, TAT, ATG],[GTG, TGC, ATG, TAT, CTA, GCT],[GCT, CTA, TAT,
ATG, TGC, GTG],[ATG, TAT, CTA, GCT, TGC, GTG]

[ ]: # Genome is represented as a path with edges (rather than nodes)

def StringKmerCompo(sequence, kmerlenght):
    """Return every overlapping k-mer of ``sequence`` in left-to-right order.

    Args:
        sequence: String (or other sliceable sequence) to decompose.
        kmerlenght: Window length k; must be at least 1.

    Returns:
        A list with the ``len(sequence) - k + 1`` windows of length k, or an
        empty list when ``sequence`` is shorter than k.

    Raises:
        ValueError: If ``kmerlenght`` is smaller than 1.
    """
    if kmerlenght < 1:
        raise ValueError("kmerlenght must be a positive integer")
    last_start = len(sequence) - kmerlenght
    return [sequence[i:i + kmerlenght] for i in range(last_start + 1)]

def PathGraph(text, k):
    """Represent ``text`` as a path whose edges are its successive k-mers.

    Args:
        text: Sequence to decompose.
        k: k-mer length, passed on to ``StringKmerCompo``.

    Returns:
        A list with one ``[prefix, suffix]`` pair per k-mer, in order, where
        ``prefix`` is the k-mer without its last character and ``suffix`` is
        the k-mer without its first character.
    """
    return [[kmer[:-1], kmer[1:]] for kmer in StringKmerCompo(text, k)]

# Pass the bare sequence: quote characters inside the literal would be
# treated as bases and show up in the first and last nodes.
print(PathGraph("TGCTGCTG", 3))

[["'T", 'TG'], ['TG', 'GC'], ['GC', 'CT'], ['CT', 'TG'], ['TG', 'GC'], ['GC',
'CT'], ['CT', 'TG'], ['TG', "G'"]]

# In[ ]:
def StringKmerCompo(sequence, kmerlenght):
    """Return every overlapping k-mer of ``sequence`` in left-to-right order.

    Args:
        sequence: String (or other sliceable sequence) to decompose.
        kmerlenght: Window length k; must be at least 1.

    Returns:
        A list with the ``len(sequence) - k + 1`` windows of length k, or an
        empty list when ``sequence`` is shorter than k.

    Raises:
        ValueError: If ``kmerlenght`` is smaller than 1.
    """
    if kmerlenght < 1:
        raise ValueError("kmerlenght must be a positive integer")
    last_start = len(sequence) - kmerlenght
    return [sequence[i:i + kmerlenght] for i in range(last_start + 1)]

def DeBruijnGraph(seq, k):
    """Build the de Bruijn graph of ``seq`` by merging equal path nodes.

    Args:
        seq: Sequence to decompose.
        k: k-mer length, passed on to ``PathGraph``.

    Returns:
        A list with one ``[node, successor, successor, ...]`` entry per
        distinct node that starts an edge, in order of first appearance.
        Successors are listed in path order and repeated edges are kept.
    """
    successors = {}
    # Group edge targets by source node in one pass; dicts keep insertion
    # order, so nodes stay in order of first appearance.
    for node, target in PathGraph(seq, k):
        successors.setdefault(node, []).append(target)

    return [[node, *targets] for node, targets in successors.items()]

# Pass the bare sequence: quote characters inside the literal would become
# part of the first and last nodes.
for e in DeBruijnGraph("TGCTGCTG", 3):
    # Join the successors directly rather than stripping quotes from a repr.
    print(e[0] + " -> " + ", ".join(e[1:]))

'T -> TG
TG -> GC, GC, "G"
GC -> CT, CT
CT -> TG, TG

[ ]: # Reconstructing the Genome path again

def GenomePath(reads):
    """Spell the string traced by a sequence of consecutive reads.

    The result is the first read followed by the last character of every
    later read.

    Args:
        reads: Sequence of strings, ordered along the path.

    Returns:
        The reconstructed string, or ``""`` when ``reads`` is empty.
    """
    if not reads:
        return ""
    # `read[-1:]` yields "" for an empty read instead of raising.
    return reads[0] + "".join(read[-1:] for read in reads[1:])

# Read one read per line; the context manager closes the file afterwards.
with open("Q1.txt", 'r') as DNAstring_open:
    reads = DNAstring_open.readlines()

# Strip only the line terminator. Slicing off the last character
# unconditionally truncates the final read when the file does not end with a
# newline.
reads2 = [e.rstrip("\r\n") for e in reads]
print(reads2)
print(GenomePath(reads2))

['TG', 'TGC', 'GCT', 'CTG', 'TGC', 'GCT', 'CTG', 'T']


TGCTGCTGT

5
0.4 Question 2
[ ]: seq =␣
,→"GAGCATAATAAGGCGAAAGAGGCAGAACTGCTGCACGACAGTAAAGAGGTGCTTGAGCATATCCTTTCTGTCAAAGAAGCTATTGCTGAAC

# Print the 11-character slice at 0-based positions 49 through 59, then the
# total length of the sequence.
print(seq[49:60])
print("The length of sequence is")
print(len(seq))

TGCTTGAGCAT
The length of sequence is
201

[ ]: # Dividing the given sequence into k mers where k = 3


# The input given is the genome sequence and the output is the divided k-mers in stacked order

import time

start = time.time()
seq = "TGCTGCTG"


def StringKmerCompo(sequence, kmerlenght):
    """Return every overlapping k-mer of ``sequence`` in left-to-right order.

    Args:
        sequence: String (or other sliceable sequence) to decompose.
        kmerlenght: Window length k; must be at least 1.

    Returns:
        A list with the ``len(sequence) - k + 1`` windows of length k, or an
        empty list when ``sequence`` is shorter than k.

    Raises:
        ValueError: If ``kmerlenght`` is smaller than 1.
    """
    if kmerlenght < 1:
        raise ValueError("kmerlenght must be a positive integer")
    last_start = len(sequence) - kmerlenght
    return [sequence[i:i + kmerlenght] for i in range(last_start + 1)]


for e in StringKmerCompo(seq, 3):
    print(e)
end = time.time()
print(end - start)  # elapsed wall-clock seconds since `start`

TGC
GCT
CTG
TGC
GCT
CTG
0.0003008842468261719

[ ]: import time
start = time.time()
def OverlapGraph(reads):
    """Build the overlap graph of ``reads`` as an adjacency mapping.

    There is an edge ``a -> b`` when ``a`` without its first character equals
    ``b`` without its last character.

    Args:
        reads: Iterable of reads (hashable, sliceable values such as strings).

    Returns:
        A dict keyed by each distinct read, in order of first appearance.
        Each value lists the reads it overlaps into, in input order; a read
        that occurs several times in ``reads`` is listed several times.
    """
    matrix = {}
    by_prefix = {}
    for read in reads:
        matrix[read] = []
        # Index reads by their prefix so each suffix needs a single lookup
        # instead of a scan over every read.
        by_prefix.setdefault(read[:-1], []).append(read)

    for read in matrix:
        # Copy so that no two keys share one successor list.
        matrix[read] = list(by_prefix.get(read[1:], ()))

    return matrix

seq = "GCGGGGCC"
result = OverlapGraph(StringKmerCompo(seq, 3))
# Print only the reads that have at least one successor.
for e in result:
    if result[e]:
        # Join the successors directly; stripping quotes out of the list's
        # repr breaks as soon as a read itself contains a quote character.
        print(e + " - > " + ", ".join(result[e]))
end = time.time()
print(end - start)  # elapsed wall-clock seconds since `start`
Time = end - start

GCG - > CGG


CGG - > GGG, GGG, GGC
GGG - > GGG, GGG, GGC
GGC - > GCG, GCC
0.0004038810729980469

[ ]: # Genome is represented as a path with edges (rather than nodes)

def PathGraph(text, k):
    """Represent ``text`` as a path whose edges are its successive k-mers.

    Args:
        text: Sequence to decompose.
        k: k-mer length, passed on to ``StringKmerCompo``.

    Returns:
        A list with one ``[prefix, suffix]`` pair per k-mer, in order, where
        ``prefix`` is the k-mer without its last character and ``suffix`` is
        the k-mer without its first character.
    """
    return [[kmer[:-1], kmer[1:]] for kmer in StringKmerCompo(text, k)]

# Show the [prefix, suffix] node pair of every 3-mer along the sequence.
print (PathGraph("TGCTGCTG", 3))

[['TG', 'GC'], ['GC', 'CT'], ['CT', 'TG'], ['TG', 'GC'], ['GC', 'CT'], ['CT',
'TG']]

# In[ ]:
def DeBruijnGraph(seq, k):
    """Build the de Bruijn graph of ``seq`` by merging equal path nodes.

    Args:
        seq: Sequence to decompose.
        k: k-mer length, passed on to ``PathGraph``.

    Returns:
        A list with one ``[node, successor, successor, ...]`` entry per
        distinct node that starts an edge, in order of first appearance.
        Successors are listed in path order and repeated edges are kept.
    """
    successors = {}
    # Group edge targets by source node in one pass; dicts keep insertion
    # order, so nodes stay in order of first appearance.
    for node, target in PathGraph(seq, k):
        successors.setdefault(node, []).append(target)

    return [[node, *targets] for node, targets in successors.items()]

for e in DeBruijnGraph("TGCTGCTG", 3):
    # Join the successors directly; stripping quotes out of a list repr
    # breaks as soon as a node itself contains a quote character.
    print(e[0] + " -> " + ", ".join(e[1:]))

TG -> GC, GC
GC -> CT, CT
CT -> TG, TG

[ ]: # Reconstructing the Genome path again

def GenomePath(reads):
    """Spell the string traced by a sequence of consecutive reads.

    The result is the first read followed by the last character of every
    later read.

    Args:
        reads: Sequence of strings, ordered along the path.

    Returns:
        The reconstructed string, or ``""`` when ``reads`` is empty.
    """
    if not reads:
        return ""
    # `read[-1:]` yields "" for an empty read instead of raising.
    return reads[0] + "".join(read[-1:] for read in reads[1:])

# Read one read per line; the context manager closes the file afterwards.
with open("Q2.txt", 'r') as DNAstring_open:
    reads = DNAstring_open.readlines()

# Strip only the line terminator. Slicing off the last character
# unconditionally truncates the final read when the file does not end with a
# newline (e.g. 'CTG' becomes 'CT').
reads2 = [e.rstrip("\r\n") for e in reads]
print(reads2)
print(GenomePath(reads2))

['TGC', 'GCT', 'CTG', 'TGC', 'GCT', 'CT']


TGCTGCTT

[ ]: # Dividing the given sequence into k mers where k = 5


# The input given is the genome sequence and the output is the divided k-mers in stacked order

8
import time

start = time.time()
seq = "TGCTGCTG"


def StringKmerCompo(sequence, kmerlenght):
    """Return every overlapping k-mer of ``sequence`` in left-to-right order.

    Args:
        sequence: String (or other sliceable sequence) to decompose.
        kmerlenght: Window length k; must be at least 1.

    Returns:
        A list with the ``len(sequence) - k + 1`` windows of length k, or an
        empty list when ``sequence`` is shorter than k.

    Raises:
        ValueError: If ``kmerlenght`` is smaller than 1.
    """
    if kmerlenght < 1:
        raise ValueError("kmerlenght must be a positive integer")
    last_start = len(sequence) - kmerlenght
    return [sequence[i:i + kmerlenght] for i in range(last_start + 1)]


for e in StringKmerCompo(seq, 5):
    print(e)
end = time.time()
print(end - start)  # elapsed wall-clock seconds since `start`

TGCTG
GCTGC
CTGCT
TGCTG
0.0007865428924560547

# In[ ]:
def OverlapGraph(reads):
    """Build the overlap graph of ``reads`` as an adjacency mapping.

    There is an edge ``a -> b`` when ``a`` without its first character equals
    ``b`` without its last character.

    Args:
        reads: Iterable of reads (hashable, sliceable values such as strings).

    Returns:
        A dict keyed by each distinct read, in order of first appearance.
        Each value lists the reads it overlaps into, in input order; a read
        that occurs several times in ``reads`` is listed several times.
    """
    matrix = {}
    by_prefix = {}
    for read in reads:
        matrix[read] = []
        # Index reads by their prefix so each suffix needs a single lookup
        # instead of a scan over every read.
        by_prefix.setdefault(read[:-1], []).append(read)

    for read in matrix:
        # Copy so that no two keys share one successor list.
        matrix[read] = list(by_prefix.get(read[1:], ()))

    return matrix

seq = "TGCTGCTG"
result = OverlapGraph(StringKmerCompo(seq, 5))
# Print only the reads that have at least one successor.
for e in result:
    if result[e]:
        # Join the successors directly; stripping quotes out of the list's
        # repr breaks as soon as a read itself contains a quote character.
        print(e + " - > " + ", ".join(result[e]))

TGCTG - > GCTGC


GCTGC - > CTGCT
CTGCT - > TGCTG, TGCTG

[ ]: # Genome is represented as a path with edges (rather than nodes)

9
def PathGraph(text, k):
    """Represent ``text`` as a path whose edges are its successive k-mers.

    Args:
        text: Sequence to decompose.
        k: k-mer length, passed on to ``StringKmerCompo``.

    Returns:
        A list with one ``[prefix, suffix]`` pair per k-mer, in order, where
        ``prefix`` is the k-mer without its last character and ``suffix`` is
        the k-mer without its first character.
    """
    return [[kmer[:-1], kmer[1:]] for kmer in StringKmerCompo(text, k)]

# Show the [prefix, suffix] node pair of every 5-mer along the sequence.
print (PathGraph("TGCTGCTG", 5))

[['TGCT', 'GCTG'], ['GCTG', 'CTGC'], ['CTGC', 'TGCT'], ['TGCT', 'GCTG']]

[ ]:

10

You might also like