You are on page 1 of 3

import transformers

import yaml
import time
from rouge import Rouge
from flask import Flask, request, jsonify,render_template
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load runtime settings from config.yaml. generate_summary() reads the
# 'chunk_size' and 'max_tokens_output' keys from this mapping.
with open('config.yaml', 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

# Load the Pegasus checkpoint and its tokenizer once, at import time, so
# every summarization call reuses the same objects.
model_name = "google/pegasus-large"
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Function to generate summary for a given chunk of text


def generate_chunk_summary(chunk, tokenizer, model, model_chunk_size, config):
    """Summarize one chunk of text and return the decoded summary.

    Args:
        chunk: The text to summarize.
        tokenizer: Object providing ``encode`` and ``decode``; it is called
            the way a Hugging Face tokenizer is called.
        model: Object providing ``generate``; it receives the encoded chunk.
        model_chunk_size: Maximum number of input tokens to feed the model.
        config: Not used by this function; kept in the signature so existing
            callers keep working.

    Returns:
        The summary text decoded from the first generated sequence.
    """
    # NOTE(review): "summarize: " is the T5 task-prefix convention; confirm it
    # is intended for whichever checkpoint the caller passes in.
    # Truncation is requested explicitly so the encoded input is capped at
    # model_chunk_size instead of relying on the tokenizer's implicit default.
    chunk_inputs = tokenizer.encode(
        "summarize: " + chunk,
        return_tensors="pt",
        max_length=model_chunk_size,
        truncation=True,
    )
    num_tokens = len(chunk_inputs[0])

    # Diagnostic only: with truncation active the first branch is a safety
    # net that reports a tokenizer that did not honor max_length.
    if num_tokens > model_chunk_size:
        print(f"Input tokens is greater than model's chunk size {model_chunk_size}")
    else:
        print(f"Input tokens is lesser than or equal to model's chunk size {model_chunk_size}")

    summary_ids = model.generate(
        chunk_inputs,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    # Only the first returned sequence is decoded.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Main function to generate summary


def _summarize_in_chunks(text, model, chunk_size, config):
    """Summarize ``text`` slice by slice and concatenate the partial summaries."""
    # NOTE(review): slices are ``chunk_size`` *characters* long, while the
    # same value is compared against *token* counts by the caller -- confirm
    # this unit mismatch is intended.
    pieces = (
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    )
    return "".join(
        generate_chunk_summary(piece, tokenizer, model, chunk_size, config)
        for piece in pieces
    )


def generate_summary(input_text, model, config):
    """Summarize ``input_text`` and report input/output token counts.

    The text is cut into slices of ``config['chunk_size']``, each slice is
    summarized with ``generate_chunk_summary`` and the results are
    concatenated. When the input has more tokens than the chunk size and the
    combined summary still reaches ``config['max_tokens_output']`` tokens,
    the combined summary is summarized once more in the same way.

    Args:
        input_text: The text to summarize.
        model: Generation model forwarded to ``generate_chunk_summary``.
        config: Mapping providing ``'chunk_size'`` and, for inputs longer
            than the chunk size, ``'max_tokens_output'``.

    Returns:
        A tuple ``(total_summary, total_input_tokens, total_output_tokens)``.
        Token counts come from the module-level ``tokenizer``.
    """
    model_chunk_size = config['chunk_size']

    # Tokenize the input once; the count drives both the long-input check
    # and the reported input-token figure.
    total_input_tokens = len(tokenizer.encode(input_text))

    total_summary = _summarize_in_chunks(
        input_text, model, model_chunk_size, config
    )

    # Only inputs longer than the chunk size get a second pass, and only
    # when the first-pass summary is still at or above the output limit.
    if (
        total_input_tokens > model_chunk_size
        and len(tokenizer.encode(total_summary)) >= config['max_tokens_output']
    ):
        print("Summary is still greater than 3000 tokens")
        total_summary = _summarize_in_chunks(
            total_summary, model, model_chunk_size, config
        )

    # Token count of the final summary, reported for monitoring.
    total_output_tokens = len(tokenizer.encode(total_summary))

    return total_summary, total_input_tokens, total_output_tokens

if __name__ == '__main__':
    # Sample article used to exercise the summarizer when the file is run
    # directly as a script.
    input_text = """
For decades, scientists have been engaged in dissecting the origins of human
cancer, and the relative roles of genetic versus epigenetic abnormalities have been
hotly debated. An explosion of data indicating the importance of epigenetic
processes, especially those resulting in the silencing of key regulatory genes, has
led to the realization that genetics and epigenetics cooperate at all stages of
cancer development. Recent advances include the understanding that silencing is
part of global epigenomic alterations in cancer, that pathways relevant to stem
cell growth and differentiation become altered, and the approval of three drugs
that target these defects in cancer patients.
Gene Silencing and Cancer
Epigenetics is defined as heritable changes in gene expression that are not
accompanied by changes in DNA sequence. Gene silencing at the level of chromatin is
necessary for the life of eukaryotic organisms and is particularly important in
orchestrating key biological processes, including differentiation, imprinting, and
silencing of large chromosomal domains such as the X chromosome, over the life span
of female mammals. In many species, silencing can be initiated and maintained
solely by processes involving the covalent modifications of histones and other
chromatin components. Vertebrates, however, have taken advantage of the
heritability of DNA cytosine methylation patterns to add another layer of control
to these processes.
Like most biological processes, silencing can become dysregulated, resulting in
the development of disease states. It can also result in the acquired inactivation
of genes during normal aging. A key property of silencing is that it can spread
over genomic regions in a progressive way, as perhaps best exemplified by position-
effect variegation in Drosophila. It seems to involve the cooperation of multiple
processes, including noncoding RNAs, covalent modifications of chromatin, physical
alterations in nucleosomal positioning, and DNA methylation, among others.
It must be appreciated, as we will outline, that epigenetic abnormalities in
cancer comprise a multitude of aberrations in virtually every component of
chromatin involved in packaging the human genome. Since epigenetic silencing
processes are mitotically heritable, they can play the same roles and undergo the
same selective processes as genetic alterations in the development of a cancer. A
principal tenet of Darwin's hypotheses for the evolution of species is that most
germline mutations are deleterious, or of no functional significance; mutations
give rise to a specific advantage selected for in an evolving population. These
same selective concepts apply for epigenetic events, which can occur at a much more
increased rate compared to mutations in somatic cells. Alterations in gene
expression induced by epigenetic events, which give rise to a cellular growth
advantage, are therefore selected for in the host organ, resulting in the
progressive uncontrolled growth of the tumor. This does not mean that all silenced
genes play direct roles, since it is becoming clear, as we will discuss later, that
whole groups of genes may be inactivated as part of an abnormal “program.”
"""
    # Summarize the sample with the module-level model and config, then
    # print the summary together with the token counts.
    total_summary, total_input_tokens, total_output_tokens = generate_summary(
        input_text, model, config
    )
    print("Summary:", total_summary)
    #print("Execution Time:", execution_time, "seconds")
    print("Input Tokens:", total_input_tokens)
    print("Output Tokens:", total_output_tokens)
    # print("Rouge-1 Score:", rouge_1)

You might also like