File size: 3,885 Bytes
858bb9d e08c7bc 858bb9d 80eccb2 858bb9d 4fa4779 858bb9d 4fa4779 858bb9d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import streamlit as st
# Page heading for the Streamlit app. It is issued before the remaining
# (heavier) imports below, so it is the first Streamlit call in the script.
st.title("HEALTHQUERY")
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
AdamW, get_linear_schedule_with_warmup, \
TrainingArguments, BeamScorer, Trainer
import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
RandomSampler, SequentialSampler
#from IPython.display import clear_output
from transformers import BioGptTokenizer, BioGptForCausalLM, TrainerCallback
from transformers import pipeline
# Summarization pipeline applied to the generated text further down.
# Two alternative checkpoints are kept here, disabled:
#summarizer_bart = pipeline("summarization", model="facebook/bart-large-cnn")
#summarizer_knnkar = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
# NOTE(review): this is built at module level, so it is constructed every time
# the script executes -- confirm whether a cached loader is wanted.
summarizer_sshle = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
import os
# ---------------------------------------------------------------------------
# Configuration constants.
#
# Only SPECIAL_TOKENS and MAXLEN are referenced by the inference code in this
# script; the remaining values look like fine-tuning settings and are
# presumably leftovers from the training setup (TODO confirm before removing).
# ---------------------------------------------------------------------------
DEBUG = False

INPUT_DIR = 'articles'

USE_APEX = True
APEX_OPT_LEVEL = 'O1'

MODEL = 'gpt2'  # {gpt2, gpt2-medium, gpt2-large, gpt2-xl}

UNFREEZE_LAST_N = 6  # The last N layers to unfreeze for training

# Marker strings used to build the generation prompt:
# bos_token + <user text> + sep_token.
SPECIAL_TOKENS = {
    "bos_token": "<|BOS|>",
    "eos_token": "<|EOS|>",
    "unk_token": "<|UNK|>",
    "pad_token": "<|PAD|>",
    "sep_token": "<|SEP|>",
}

# Passed as max_length to model.generate().
MAXLEN = 256  # {768, 1024, 1280, 1600}

TRAIN_SIZE = 0.8

# Batch size and update interval depend on whether APEX is enabled.
if USE_APEX:
    TRAIN_BATCHSIZE = 16
    BATCH_UPDATE = 128
else:
    TRAIN_BATCHSIZE = 8
    BATCH_UPDATE = 256

EPOCHS = 3
LR = 5e-4
EPS = 1e-8
WARMUP_STEPS = 1e2

SEED = 2020

DEVIDE_BY = 20

# Set the WANDB_DISABLED flag in the process environment.
os.environ['WANDB_DISABLED'] = 'true'
# ---------------------------------------------------------------------------
# Inference: load the model, read the user's text, generate a continuation
# and show a summary of it.
# ---------------------------------------------------------------------------
# NOTE(review): these loads run every time the script executes. If the
# deployed Streamlit version provides a resource cache (st.cache_resource),
# wrapping the two from_pretrained calls in a cached loader would avoid
# reloading the weights on each interaction.
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
# Model weights are read from the current working directory.
model = BioGptForCausalLM.from_pretrained('./')

input_text = st.text_input("Please Provide your text:")
title = input_text

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

if input_text:
    # Prompt layout: <|BOS|> + user text + <|SEP|>.
    prompt = SPECIAL_TOKENS['bos_token'] + title + SPECIAL_TOKENS['sep_token']
    # Tokenize only when there is something to generate from; shape (1, seq).
    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)
    prompt_length = generated.shape[1]

    # Generate text
    sample_outputs = model.generate(
        generated,
        do_sample=True,
        max_length=MAXLEN,
        top_k=10,
        top_p=0.7,
        temperature=0.5,
        repetition_penalty=2.0,
        num_return_sequences=1,
    )

    for i, sample_output in enumerate(sample_outputs, start=1):
        # The returned sequence starts with the prompt tokens. Drop them by
        # token count instead of a fixed character offset: decoding can
        # re-space the prompt, so a character offset may cut the answer in
        # the wrong place.
        continuation = sample_output[prompt_length:]
        text = tokenizer.decode(continuation, skip_special_tokens=True).strip()
        st.write("{}: {}\n\n".format(i, text))

        if not text:
            # Nothing was generated beyond the prompt; there is no content
            # to summarize.
            continue

        # Alternative summarizers (facebook/bart-large-cnn and
        # knkarthick/MEETING_SUMMARY) are disabled; only the distilbart
        # pipeline is used.
        distl = summarizer_sshle(text, max_length=200, min_length=30, do_sample=False)
        st.write('-------distilbart_cnn_12-6 model -----')
        st.write(distl[0]['summary_text'])
else:
    st.write('Welcome to GPT2')
# Create a "Regenerate" button
# Display output
|