You are on page 1 of 2

def read_words(fileName):
    """Read a word list (one word per line) and return the usable words.

    Every line is stripped of surrounding whitespace and upper-cased.
    Only words made of at least three characters are kept.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A list of upper-cased words, in the order they appear in the file.
    """
    l_wor = []
    with open(fileName, 'r') as engw:
        for line in engw:
            # Strip before measuring: the raw line still carries its newline,
            # which would let two-letter words slip through the length filter.
            word = line.strip()
            if len(word) >= 3:
                l_wor.append(word.upper())
    return l_wor

def read_sequences(fileName):
    """Parse a FASTA-style file into a dict mapping identifier to sequence.

    A line starting with ">" opens a new record. Its identifier is the
    second "|"-separated field of the header (e.g. "O95139" in
    ">sp|O95139|NDUB6_HUMAN ..."). When the header has no "|", the first
    whitespace-delimited token after ">" is used instead. All following
    lines, up to the next header, are concatenated into the record's
    sequence with surrounding whitespace removed.

    Args:
        fileName: Path of the sequence file to read.

    Returns:
        A dict {identifier: sequence}. If an identifier appears more than
        once, the last record wins.
    """
    chunks = {}
    current = None
    with open(fileName, 'r') as hprot:
        for line in hprot:
            if line.startswith(">"):
                fields = line.split("|")
                if len(fields) > 1:
                    # Strip so a two-field header does not keep its newline.
                    current = fields[1].strip()
                else:
                    tokens = line[1:].split()
                    current = tokens[0] if tokens else ""
                chunks[current] = []
            elif current is not None:
                # Lines seen before the first header belong to no record
                # and are ignored instead of raising KeyError.
                chunks[current].append(line.strip())
    # Join once per record rather than growing a string line by line.
    return {seq_id: "".join(parts) for seq_id, parts in chunks.items()}

def search_words_in_proteome(d_pr, l_wor):
    """Count, for each word, how many sequences contain it.

    A sequence is counted once per word, however many times the word
    occurs inside it. Words found in no sequence are left out of the result.

    Args:
        d_pr: Dict mapping a sequence identifier to its sequence string.
        l_wor: Iterable of words to look for.

    Returns:
        A dict {word: number of sequences containing the word}, holding
        only the words found at least once.
    """
    d_fw = {}
    for word in l_wor:
        hits = sum(1 for seq in d_pr.values() if word in seq)
        if hits:
            d_fw[word] = hits
    return d_fw

def find_most_frequent_word(d_fw):
    """Print the word(s) having the highest count in ``d_fw``.

    Every word tied for the maximum is reported on its own line, in the
    dict's iteration order.

    Args:
        d_fw: Dict mapping a word to its count.

    Returns:
        None. The result is written to standard output.
    """
    if not d_fw:
        # max() on an empty sequence raises ValueError; report it instead.
        print("no word found in any sequence")
        return
    freq_word = max(d_fw.values())
    for word, count in d_fw.items():
        if count == freq_word:
            print(f"{word} is the most frequent word with {freq_word} time ")

def search_words_in_proteom(d_pr, l_wor):
    """Count the sequences containing each word and report every word.

    For each word, one line is printed saying either in how many
    sequences it was found or that it was not found at all. A sequence
    counts once per word.

    Args:
        d_pr: Dict mapping a sequence identifier to its sequence string.
        l_wor: Iterable of words to look for.

    Returns:
        A dict {word: number of sequences containing the word}, holding
        only the words found at least once.
    """
    mList = {}
    for w in l_wor:
        # The original read the undefined name ``d_prc`` here (NameError).
        hits = sum(1 for seq_id in d_pr if w in d_pr[seq_id])
        if hits != 0:
            mList[w] = hits
            print(f"{w} found {hits} times in sequences")
        else:
            print(f"{w} not found in any sequence")
    return mList

if __name__ == '__main__':
    # Script entry point: ask for the two input files, then report the
    # number of words and sequences read, the sequence stored under the
    # identifier "O95139", and the most frequent word(s).
    words_file = input("Enter le nom du fichier qui contient word: ")
    # This prompt was split across two lines (an unterminated string
    # literal); it is rejoined here as a single literal.
    proteome_file = input(
        "Enter le nom du fichier qui contient proteome sequences: ")

    print("le nombre de words trouver: ")
    words = read_words(words_file)
    print(f"{len(words)} word ")

    print("le nombre de sequences trouver:")
    sequence = read_sequences(proteome_file)
    print(f"{len(sequence)} sequence")

    print(" la sequence de O95139 ")
    # dict.get prints None rather than raising when the id is absent.
    print(sequence.get("O95139"))

    print("le mot le plus fréquent:")
    find_most_frequent_word(search_words_in_proteome(sequence, words))

You might also like