import os

import streamlit as st
import torch
from transformers import BioGptTokenizer, BioGptForCausalLM, pipeline

st.title("HEALTHQUERY")

# Summarization pipeline applied to the generated text. Two alternatives that
# were evaluated are kept here for reference:
#   summarizer_bart = pipeline("summarization", model="facebook/bart-large-cnn")
#   summarizer_knnkar = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
summarizer_sshle = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

DEBUG = False

# Hyperparameters from the fine-tuning run, kept for reference; only
# SPECIAL_TOKENS, MAXLEN, and SEED are relevant at inference time.
INPUT_DIR = 'articles'
USE_APEX = True
APEX_OPT_LEVEL = 'O1'
MODEL = 'gpt2'        # {gpt2, gpt2-medium, gpt2-large, gpt2-xl}
UNFREEZE_LAST_N = 6   # the last N layers to unfreeze for training
SPECIAL_TOKENS = {"bos_token": "<|BOS|>",
                  "eos_token": "<|EOS|>",
                  "unk_token": "<|UNK|>",
                  "pad_token": "<|PAD|>",
                  "sep_token": "<|SEP|>"}
MAXLEN = 256          # {768, 1024, 1280, 1600}
TRAIN_SIZE = 0.8
if USE_APEX:
    TRAIN_BATCHSIZE = 16
    BATCH_UPDATE = 128
else:
    TRAIN_BATCHSIZE = 8
    BATCH_UPDATE = 256
EPOCHS = 3
LR = 5e-4
EPS = 1e-8
WARMUP_STEPS = 1e2
SEED = 2020
DIVIDE_BY = 20

os.environ['WANDB_DISABLED'] = 'true'

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
# from_pretrained() expects a model id or a checkpoint *directory* containing
# pytorch_model.bin and config.json, not a path to the .bin file itself.
model = BioGptForCausalLM.from_pretrained('alidemo')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

input_text = st.text_input("Please provide your text:")

if len(input_text) > 0:
    title = input_text
    # The custom SPECIAL_TOKENS must match the ones used during fine-tuning;
    # otherwise the BioGPT tokenizer splits them into ordinary subwords.
    prompt = SPECIAL_TOKENS['bos_token'] + title + SPECIAL_TOKENS['sep_token']
    generated = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Generate text
    sample_outputs = model.generate(generated,
                                    do_sample=True,
                                    max_length=MAXLEN,
                                    top_k=10,
                                    top_p=0.7,
                                    temperature=0.5,
                                    repetition_penalty=2.0,
                                    num_return_sequences=1)

    for i, sample_output in enumerate(sample_outputs):
        text = tokenizer.decode(sample_output, skip_special_tokens=True)
        # Heuristic offset that skips the echoed prompt (the title plus
        # roughly 25 characters of leftover token text) at the start of the
        # decoded string.
        a = len(title) + 25
        if DEBUG:
            st.write(a)
        # Print each generated question/passage.
        st.write("{}: {}\n\n".format(i + 1, text[a:]))

        # Summarize the generated continuation.
        bart_val = text[a:]
        distl = summarizer_sshle(bart_val, max_length=200, min_length=30,
                                 do_sample=False)
        st.write('------- distilbart-cnn-12-6 summary -------')
        st.write(distl[0]['summary_text'])
else:
    st.write('Welcome to HEALTHQUERY. Enter a prompt above to generate text.')

# TODO: create a "Regenerate" button and display the output; sketches follow below.
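# --- Hedged note (not in the original): if the generated text shows the
# literal strings <|BOS|> or <|SEP|> being split into subwords, the custom
# SPECIAL_TOKENS were never registered with the BioGPT tokenizer. A minimal
# sketch of the registration, which would belong right after the tokenizer and
# model are loaded, assuming the fine-tuned checkpoint already holds
# embeddings for these tokens:
#
#   tokenizer.add_special_tokens(SPECIAL_TOKENS)
#   model.resize_token_embeddings(len(tokenizer))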
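# --- Hedged sketch: the original script initialized an unused
# `perplexity_text_pairs` list and imported `heapq.nsmallest`, which suggests
# the intent was to score several candidates and keep the most fluent one.
# A minimal sketch of that idea, assuming `num_return_sequences` is raised
# above 1; the helper name `sequence_perplexity` is hypothetical. ---
from heapq import nsmallest

def sequence_perplexity(lm, token_ids):
    """Perplexity of one generated sequence under the causal LM."""
    with torch.no_grad():
        # Feeding the sequence back in as its own labels yields the mean
        # token-level cross-entropy; exp(loss) is the perplexity.
        out = lm(token_ids.unsqueeze(0), labels=token_ids.unsqueeze(0))
    return torch.exp(out.loss).item()

# Example usage (inside the generation branch above):
#   perplexity_text_pairs = [
#       (sequence_perplexity(model, out),
#        tokenizer.decode(out, skip_special_tokens=True))
#       for out in sample_outputs]
#   best_ppl, best_text = nsmallest(1, perplexity_text_pairs)[0]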
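# --- Hedged sketch of the "Regenerate" button from the TODO above
# (assumption: a plain st.button suffices, because clicking any Streamlit
# button reruns the script top to bottom, and with do_sample=True the rerun
# draws a fresh sample; the session-state counter only tracks how many times
# the user has regenerated). ---
if len(input_text) > 0:
    if st.button("Regenerate"):
        st.session_state["regen_count"] = st.session_state.get("regen_count", 0) + 1
    if st.session_state.get("regen_count", 0) > 0:
        st.caption("Regenerated {} time(s)".format(st.session_state["regen_count"]))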