dutch-questgen / app.py
Michelvh's picture
Update app
92da267
raw
history blame
1.64 kB
import gradio as gr
from transformers import T5ForConditionalGeneration, T5TokenizerFast
import nltk
from nltk import tokenize
# The tokenizer is loaded from the base Dutch T5 checkpoint; the model
# weights further down come from a separate question-generation checkpoint.
checkpoint = "yhavinga/t5-base-dutch"
tokenizer = T5TokenizerFast.from_pretrained(checkpoint)
# Register '<sep>' as the separator token and add it to the vocabulary.
# hf_run_model splits the decoded model output on this same literal.
tokenizer.sep_token = '<sep>'
tokenizer.add_tokens(['<sep>'])
# NOTE(review): the model's embeddings are not resized here after adding a
# token; presumably the fine-tuned checkpoint already accounts for '<sep>'
# -- confirm.
hfmodel = T5ForConditionalGeneration.from_pretrained("Michelvh/t5-end2end-questions-generation-dutch")
def hf_run_model(input_string, **generator_args):
    """Generate questions for one passage of text.

    The passage is wrapped in the ``"generate questions: ... </s>"`` prompt,
    encoded with the module-level ``tokenizer`` and passed to
    ``hfmodel.generate``.

    Args:
        input_string: Passage to generate questions for.
        **generator_args: Keyword arguments forwarded to
            ``hfmodel.generate``. They override the defaults below key by
            key; with no overrides the defaults are used unchanged.

    Returns:
        A list with one entry per decoded sequence. Each entry is the list
        of pieces obtained by splitting the decoded text on ``"<sep>"``;
        pieces are not stripped and may be empty.
    """
    # Defaults first, caller overrides second. Previously ``generator_args``
    # was rebound to the defaults, silently discarding every override.
    generation_kwargs = {
        "max_length": 256,
        "num_beams": 4,
        "length_penalty": 1.5,
        "no_repeat_ngram_size": 3,
        "early_stopping": True,
        "num_return_sequences": 1,
    }
    generation_kwargs.update(generator_args)

    input_string = "generate questions: " + input_string + " </s>"
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
    res = hfmodel.generate(input_ids, **generation_kwargs)
    output = tokenizer.batch_decode(res, skip_special_tokens=True)
    return [item.split("<sep>") for item in output]
def chunkText(text, frameSize=5):
    """Split text into overlapping frames of consecutive sentences.

    Sentences come from ``tokenize.sent_tokenize``. A window of
    ``frameSize - 1`` sentences slides over them one sentence at a time, and
    each window is joined with single spaces.

    Args:
        text: Text to split.
        frameSize: One more than the number of sentences per frame; the
            default of 5 yields 4-sentence frames. Must be at least 2.

    Returns:
        A list of frame strings. Text with no sentences yields an empty
        list; text with fewer sentences than one window yields a single
        frame holding all of them.

    Raises:
        ValueError: If ``frameSize`` is smaller than 2.
    """
    # NOTE(review): the window is frameSize - 1 sentences wide, not
    # frameSize. Kept as-is so frames for existing inputs do not change;
    # confirm whether the narrower window is intended.
    window = frameSize - 1
    if window < 1:
        # A non-positive window previously produced empty-string frames.
        raise ValueError("frameSize must be at least 2")

    # NOTE(review): sent_tokenize is called without a language argument
    # although the checkpoints are Dutch; presumably NLTK's default
    # language applies -- confirm.
    sentences = tokenize.sent_tokenize(text)
    if not sentences:
        return []
    if len(sentences) <= window:
        # Short text used to yield no frames at all (the range below is
        # empty), so no questions were generated for it.
        return [" ".join(sentences)]
    return [
        " ".join(sentences[start:start + window])
        for start in range(len(sentences) - window + 1)
    ]
def flatten(l):
    """Concatenate the items of every iterable in ``l`` into one list.

    Only one level of nesting is removed and the original order is kept.
    """
    merged = []
    for group in l:
        merged.extend(group)
    return merged
def run_model_with_frames(text):
    """Generate the distinct questions for every sentence frame of ``text``.

    The text is split into frames with ``chunkText``; each frame is passed
    to ``hf_run_model`` and the resulting pieces are stripped of surrounding
    whitespace and de-duplicated.

    Args:
        text: Input text to generate questions for.

    Returns:
        A set of non-empty, stripped strings. A set has no defined order.
    """
    questions = set()
    for frame in chunkText(text):
        for candidate in flatten(hf_run_model(frame)):
            question = candidate.strip()
            # Splitting on "<sep>" leaves an empty piece when the decoded
            # text ends with the separator; do not report it as a question.
            if question:
                questions.add(question)
    return questions
# Expose run_model_with_frames through a Gradio interface with a single text
# input and a single text output.
iface = gr.Interface(fn=run_model_with_frames, inputs="text", outputs="text")
# Start the app; this runs at module level (there is no __main__ guard).
iface.launch()