Professional Documents
Culture Documents
U4AIE20054_Assignment3
for e in StringKmerCompo(seq,3):
print (e)
end = time.time()
print(end - start)
'TG
TGC
GCT
CTG
TGC
GCT
CTG
TG'
0.001967906951904297
[ ]: # The 3-mers of the sequence are given as input, and their overlaps are returned in the form of an adjacency list.
def OverlapGraph(reads):
    """Build the overlap graph of ``reads`` as an adjacency list.

    A read ``a`` points to a read ``b`` when ``a`` without its first
    character equals ``b`` without its last character.

    Args:
        reads: A re-iterable collection of strings; duplicates are allowed.

    Returns:
        A dict with one key per distinct read, in order of first appearance.
        Each value is the list of reads (duplicates included, in input
        order) that the key overlaps into.
    """
    # Index the reads by their prefix once, so each lookup below is a single
    # dict access instead of a scan over every read.
    by_prefix = {}
    for read in reads:
        by_prefix.setdefault(read[:-1], []).append(read)

    matrix = {}
    for read in reads:
        if read not in matrix:
            # Copy the bucket so that no two keys share one list object.
            matrix[read] = list(by_prefix.get(read[1:], ()))
    return matrix
# Build the overlap graph of the 3-mers of the sample sequence and print it
# as an adjacency list, one "read - > successors" line per read that has any.
seq = "GTGCTATGCT"
result = OverlapGraph(StringKmerCompo(seq, 3))
for e, successors in result.items():
    if successors:
        # Join the reads directly; stripping brackets and quotes from the
        # list's repr() garbles any read that itself contains a quote.
        print(e + " - > " + ", ".join(successors))
class Graph:
# Constructor
def __init__(self, edges, n):
if len(path) == n:
print(path)
return
2
for w in graph.adjList[v]:
if not visited[w]:
visited[w] = True
path.append(w)
visited[w] = False
path.pop()
path = [start]
visited = [False] * n
visited[start] = True
if __name__ == '__main__':
    # Edge list of the sample graph, one (u, v) pair per edge.
    edges = [
        (0, 1), (1, 2), (2, 3),
        (3, 4), (4, 5), (5, 1),
    ]

    # Number of vertices; the edges above use the labels 0 through 5.
    n = 6

    graph = Graph(edges, n)

    # NOTE(review): presumably prints every Hamiltonian path of the graph;
    # the helper is defined outside this block, so confirm there.
    findHamiltonianPaths(graph, n)
[0, 1, 2, 3, 4, 5]
[0, 1, 5, 4, 3, 2]
[2, 3, 4, 5, 1, 0]
[5, 4, 3, 2, 1, 0]
# Node labels: GTG -> 0, TGC -> 1, GCT -> 2, CTA -> 3, TAT -> 4, ATG -> 5
# Lookup table from a one-digit node label to the 3-mer it stands for.
dct = {
    '0': 'GTG',
    '1': 'TGC',
    '2': 'GCT',
    '3': 'CTA',
    '4': 'TAT',
    '5': 'ATG',
}

# Rewrite `string` (defined outside this block) character by character:
# digits are replaced by their 3-mer, everything else is copied unchanged.
newstr = ''.join(dct[ch] if ch.isdigit() else ch for ch in string)
print(newstr)
[GTG, TGC, GCT, CTA, TAT, ATG],[GTG, TGC, ATG, TAT, CTA, GCT],[GCT, CTA, TAT,
ATG, TGC, GTG],[ATG, TAT, CTA, GCT, TGC, GTG]
def PathGraph(text, k):
    """Return the path graph of ``text`` as a list of ``[prefix, suffix]`` pairs.

    Each k-mer produced by ``StringKmerCompo(text, k)`` becomes one pair made
    of the k-mer without its last character and the k-mer without its first
    character. Pairs are listed in k-mer order.
    """
    return [[kmer[:-1], kmer[1:]] for kmer in StringKmerCompo(text, k)]
[["'T", 'TG'], ['TG', 'GC'], ['GC', 'CT'], ['CT', 'TG'], ['TG', 'GC'], ['GC',
'CT'], ['CT', 'TG'], ['TG', "G'"]]
def DeBruijnGraph(seq, k):
    """Build the De Bruijn graph of ``seq`` as a list of adjacency rows.

    Every ``[prefix, suffix]`` pair returned by ``PathGraph(seq, k)`` is one
    edge. Edges that leave the same node are merged into a single row.

    Args:
        seq: The sequence to decompose.
        k: The k-mer length passed on to ``PathGraph``.

    Returns:
        A list of rows ``[node, successor, successor, ...]``. Rows appear in
        order of each node's first occurrence; successors keep the order and
        multiplicity of their edges.
    """
    # One row per node, keyed by the node. The dict keeps insertion order, so
    # the rows come out in order of first appearance.
    rows = {}
    for node, successor in PathGraph(seq, k):
        # The row is seeded with the node itself the first time it is seen.
        rows.setdefault(node, [node]).append(successor)
    return list(rows.values())
# Print the De Bruijn graph of the sample sequence, one "node -> successors"
# line per node. The sequence literal holds only the bases: quote characters
# inside it would be treated as part of the sequence and create bogus nodes.
for e in DeBruijnGraph("TGCTGCTG", 3):
    # e is [node, successor, ...]; join the successors directly rather than
    # stripping characters from the list's repr().
    print(e[0] + " -> " + ", ".join(e[1:]))
'T -> TG
TG -> GC, GC, "G"
GC -> CT, CT
CT -> TG, TG
def GenomePath(reads):
    """Spell the string described by an ordered list of overlapping reads.

    The first read is taken whole; every following read contributes only its
    last character.

    Args:
        reads: A sequence of strings in path order.

    Returns:
        The reconstructed string, or ``""`` when ``reads`` is empty.
    """
    if not reads:
        # Nothing to spell; avoids indexing into an empty sequence.
        return ""
    # read[-1:] is the last character, or "" for an empty read.
    return reads[0] + "".join(read[-1:] for read in reads[1:])
5
0.4 Question 2
[ ]: seq =␣
,→"GAGCATAATAAGGCGAAAGAGGCAGAACTGCTGCACGACAGTAAAGAGGTGCTTGAGCATATCCTTTCTGTCAAAGAAGCTATTGCTGAAC
print(seq[49:60])
print("The length of sequence is")
print(len(seq))
TGCTTGAGCAT
The length of sequence is
201
import time
start = time.time()
seq = "TGCTGCTG"
def StringKmerCompo(sequence, kmerlenght):
    """Return every substring of length ``kmerlenght`` of ``sequence``.

    The substrings are listed left to right, one per start position, so
    repeated k-mers appear as often as they occur. The result is empty when
    ``sequence`` is shorter than ``kmerlenght``.
    """
    window_count = len(sequence) - kmerlenght + 1
    return [
        sequence[start:start + kmerlenght]
        for start in range(window_count)
    ]
# Print each 3-mer of `seq` on its own line.
kmers = StringKmerCompo(seq, 3)
for e in kmers:
    print(e)

# Report the seconds elapsed since `start`, which is set outside this block.
end = time.time()
elapsed = end - start
print(elapsed)
TGC
GCT
CTG
TGC
GCT
CTG
0.0003008842468261719
[ ]: import time
start = time.time()
def OverlapGraph(reads):
    """Return the overlap graph of ``reads`` as a dict of adjacency lists.

    Read ``a`` is connected to read ``b`` when ``a[1:] == b[:-1]``, i.e. the
    suffix of ``a`` equals the prefix of ``b``.

    Args:
        reads: A re-iterable collection of strings; duplicates are allowed.

    Returns:
        A dict keyed by each distinct read (first-appearance order) whose
        value lists every read in ``reads`` - duplicates included, in input
        order - that the key overlaps into.
    """
    # Group the reads by prefix up front; this replaces the comparison of
    # every read against every other read with one dict lookup per read.
    prefix_index = {}
    for candidate in reads:
        prefix_index.setdefault(candidate[:-1], []).append(candidate)

    # list(...) gives every key its own list rather than a shared bucket.
    return {
        read: list(prefix_index.get(read[1:], ()))
        for read in reads
    }
# Build the overlap graph of the 3-mers of the sample sequence and print it
# as an adjacency list, one "read - > successors" line per read that has any.
seq = "GCGGGGCC"
result = OverlapGraph(StringKmerCompo(seq, 3))
for e, successors in result.items():
    if successors:
        # Join the reads directly; stripping brackets and quotes from the
        # list's repr() garbles any read that itself contains a quote.
        print(e + " - > " + ", ".join(successors))

# Report the seconds elapsed since `start` (set outside this block) and keep
# the value in `Time`.
end = time.time()
print(end - start)
Time = end - start
def PathGraph(text, k):
    """Turn the k-mers of ``text`` into a list of ``[prefix, suffix]`` edges.

    The k-mers come from ``StringKmerCompo(text, k)``. For each one, the
    prefix is the k-mer minus its last character and the suffix is the k-mer
    minus its first character. Edges keep the order of the k-mers.
    """
    edges = []
    for kmer in StringKmerCompo(text, k):
        prefix, suffix = kmer[:-1], kmer[1:]
        edges.append([prefix, suffix])
    return edges
[['TG', 'GC'], ['GC', 'CT'], ['CT', 'TG'], ['TG', 'GC'], ['GC', 'CT'], ['CT',
'TG']]
def DeBruijnGraph(seq, k):
    """Build the De Bruijn graph of ``seq`` as a list of adjacency rows.

    Every ``[prefix, suffix]`` pair returned by ``PathGraph(seq, k)`` is one
    edge. Edges that leave the same node are merged into a single row.

    Args:
        seq: The sequence to decompose.
        k: The k-mer length passed on to ``PathGraph``.

    Returns:
        A list of rows ``[node, successor, successor, ...]``. Rows appear in
        order of each node's first occurrence; successors keep the order and
        multiplicity of their edges, so parallel edges are preserved.
    """
    # Group the edges by source node in one pass. The dict keeps insertion
    # order, which yields the rows in order of first appearance without a
    # separate de-duplication step.
    adjacency = {}
    for node, successor in PathGraph(seq, k):
        # The row is seeded with the node itself the first time it is seen.
        adjacency.setdefault(node, [node]).append(successor)
    return list(adjacency.values())
# Print the De Bruijn graph of the sample sequence, one "node -> successors"
# line per node.
for e in DeBruijnGraph("TGCTGCTG", 3):
    # e is [node, successor, ...]; join the successors directly, because
    # stripping characters from the list's repr() garbles any node that
    # itself contains a quote.
    print(e[0] + " -> " + ", ".join(e[1:]))
TG -> GC, GC
GC -> CT, CT
CT -> TG, TG
def GenomePath(reads):
    """Reconstruct a string from reads listed in path order.

    The result starts with the whole first read and is extended by the last
    character of each subsequent read.

    Args:
        reads: A sequence of strings in path order.

    Returns:
        The reconstructed string, or ``""`` when ``reads`` is empty.
    """
    if not reads:
        # An empty path spells the empty string instead of raising IndexError.
        return ""
    first, rest = reads[0], reads[1:]
    # Collect the trailing characters and join once rather than growing the
    # string one concatenation at a time.
    tail = "".join(read[-1:] for read in rest)
    return first + tail
8
import time
start = time.time()
seq = "TGCTGCTG"
def StringKmerCompo(sequence, kmerlenght):
    """List the k-mer composition of ``sequence`` in left-to-right order.

    One substring of length ``kmerlenght`` is produced per start position, so
    repeated k-mers are repeated in the result. A ``sequence`` shorter than
    ``kmerlenght`` gives an empty list.
    """
    kmers = []
    last_start = len(sequence) - kmerlenght
    for start in range(last_start + 1):
        stop = start + kmerlenght
        kmers.append(sequence[start:stop])
    return kmers
# Print each 5-mer of `seq` on its own line.
kmers = StringKmerCompo(seq, 5)
for e in kmers:
    print(e)

# Report the seconds elapsed since `start`, which is set outside this block.
end = time.time()
elapsed = end - start
print(elapsed)
TGCTG
GCTGC
CTGCT
TGCTG
0.0007865428924560547
def OverlapGraph(reads):
    """Compute which reads overlap into which, as an adjacency list.

    There is an edge from read ``a`` to read ``b`` when ``a`` minus its first
    character equals ``b`` minus its last character.

    Args:
        reads: A re-iterable collection of strings; duplicates are allowed.

    Returns:
        A dict with one entry per distinct read, in order of first
        appearance, mapping it to the list of reads (duplicates included, in
        input order) that follow it.
    """
    # Bucket the reads by prefix once, so finding the followers of a read is
    # a single lookup rather than a scan of all reads.
    followers_by_prefix = {}
    for read in reads:
        followers_by_prefix.setdefault(read[:-1], []).append(read)

    matrix = {}
    for read in reads:
        if read in matrix:
            continue  # duplicate read: its entry already exists
        # Copy the bucket so separate keys never share one list object.
        matrix[read] = list(followers_by_prefix.get(read[1:], ()))
    return matrix
# Build the overlap graph of the 5-mers of the sample sequence and print it
# as an adjacency list, one "read - > successors" line per read that has any.
seq = "TGCTGCTG"
result = OverlapGraph(StringKmerCompo(seq, 5))
for e, successors in result.items():
    if successors:
        # Join the reads directly; stripping brackets and quotes from the
        # list's repr() garbles any read that itself contains a quote.
        print(e + " - > " + ", ".join(successors))
9
def PathGraph(text, k):
    """Build the path graph of ``text``: one ``[prefix, suffix]`` pair per k-mer.

    The k-mers are taken from ``StringKmerCompo(text, k)`` in order. Each is
    split into the k-mer without its last character and the k-mer without
    its first character.
    """
    successive_kmers = StringKmerCompo(text, k)
    return [[piece[:-1], piece[1:]] for piece in successive_kmers]
[ ]:
10