# Old_Chunking_Lib.py
#########################################
# Old Chunking Library
# This library is used to handle chunking of text for summarization.
#
####
####################
# Function List
#
# 1. chunk_transcript(transcript: str, chunk_duration: int, words_per_second: int) -> List[str]
# 2. summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int, words_per_second: int) -> str  (currently commented out)
# 3. get_chat_completion(messages, model='gpt-4-turbo')
# 4. chunk_on_delimiter(input_string: str, max_tokens: int, delimiter: str) -> List[str]
# 5. combine_chunks_with_no_minimum(chunks: List[str], max_tokens: int, chunk_delimiter="\n\n", header: Optional[str] = None, add_ellipsis_for_overflow=False) -> Tuple[List[str], List[List[int]], int]
# 6. rolling_summarize(text: str, detail: float = 0, model: str = 'gpt-4-turbo', additional_instructions: Optional[str] = None, minimum_chunk_size: Optional[int] = 500, chunk_delimiter: str = ".", summarize_recursively=False, verbose=False) -> str
# 7. summarize_with_detail_openai(text, detail, verbose=False)
# 8. summarize_with_detail_recursive_openai(text, detail, verbose=False)
#
####################
# Import necessary libraries
import logging
import os
from typing import Optional, List, Tuple
#
# Import 3rd party
from openai import OpenAI
from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
#
# Import Local
#
#######################################################################################################################
# Function Definitions
#
######### Words-per-second Chunking #########
def chunk_transcript(transcript: str, chunk_duration: int, words_per_second: int) -> List[str]:
    """Split a transcript into chunks covering roughly `chunk_duration` seconds each,
    assuming a fixed speaking rate of `words_per_second`."""
    words = transcript.split()
    words_per_chunk = chunk_duration * words_per_second
    chunks = [' '.join(words[i:i + words_per_chunk]) for i in range(0, len(words), words_per_chunk)]
    return chunks
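# Example (illustrative; the 2 words/second speaking rate is an assumed value,
# not one taken from measurement): a 300-word transcript cut into 60-second
# chunks at 2 words/second yields chunks of 120, 120, and 60 words.
# >>> parts = chunk_transcript(" ".join(["word"] * 300), chunk_duration=60, words_per_second=2)
# >>> [len(p.split()) for p in parts]
# [120, 120, 60]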
#######################################
######### Token-size Chunking ######### FIXME - OpenAI only currently
# This is dirty and shameful and terrible; it should be replaced with a proper implementation.
openai_api_key = os.environ.get("OPENAI_API_KEY", "Fake_key")  # FIXME: load from config/env instead of hard-coding
client = OpenAI(api_key=openai_api_key)
# This function chunks a text into smaller pieces based on a maximum token count and a delimiter
def chunk_on_delimiter(input_string: str,
                       max_tokens: int,
                       delimiter: str) -> List[str]:
    chunks = input_string.split(delimiter)
    combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(
        chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True)
    if dropped_chunk_count > 0:
        logging.warning(f"{dropped_chunk_count} chunks were dropped due to exceeding the token limit.")
    # Re-attach the delimiter so chunk boundaries survive a later join
    combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
    return combined_chunks
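# combine_chunks_with_no_minimum() is called above but is not defined anywhere
# in this file as shown. The sketch below is an assumption: it matches the
# signature in the Function List and the greedy token-packing approach used in
# the OpenAI cookbook's long-document summarization recipe, not recovered
# original code.
def combine_chunks_with_no_minimum(chunks: List[str],
                                   max_tokens: int,
                                   chunk_delimiter="\n\n",
                                   header: Optional[str] = None,
                                   add_ellipsis_for_overflow=False,
                                   ) -> Tuple[List[str], List[List[int]], int]:
    dropped_chunk_count = 0
    output = []  # final combined chunks
    output_indices = []  # indices (into `chunks`) of the pieces in each combined chunk
    candidate = [] if header is None else [header]  # combined chunk currently being built
    candidate_indices = []
    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]
        # A single piece that alone exceeds the budget cannot be packed; drop it.
        if len(openai_tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
            logging.warning("Chunk overflow: a single piece exceeds max_tokens and was dropped.")
            if (add_ellipsis_for_overflow
                    and len(openai_tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens):
                candidate.append("...")  # mark the omission in the running candidate
            dropped_chunk_count += 1
            continue
        # Would adding this piece push the candidate over the token budget?
        extended_candidate_token_count = len(openai_tokenize(chunk_delimiter.join(candidate + [chunk])))
        if extended_candidate_token_count > max_tokens:
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header  # start a fresh candidate with this piece
            candidate_indices = [chunk_i]
        else:
            candidate.append(chunk)
            candidate_indices.append(chunk_i)
    # Flush the final candidate if it holds anything beyond the optional header.
    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)
    return output, output_indices, dropped_chunk_count
# With this helper in place, chunk_on_delimiter("a. b. c.", max_tokens=50, delimiter=". ")
# packs the sentence pieces greedily into as few chunks as fit within 50 tokens each.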
#######################################
######### Words-per-second Chunking #########
# FIXME - Whole section needs to be re-written
# (Reuses chunk_transcript() defined above; the summarize_chunks() draft below is kept commented out.)
# def summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int,
#                      words_per_second: int) -> str:
#     if api_name not in summarizers:  # See 'summarizers' dict in the main script
#         return f"Unsupported API: {api_name}"
#
#     if not transcript:
#         logging.error("Empty or None transcript provided to summarize_chunks")
#         return "Error: Empty or None transcript provided"
#
#     text = extract_text_from_segments(transcript)
#     chunks = chunk_transcript(text, chunk_duration, words_per_second)
#
#     # FIXME
#     custom_prompt = args.custom_prompt
#
#     summaries = []
#     for chunk in chunks:
#         if api_name == 'openai':
#             # Ensure the correct model and prompt are passed
#             summaries.append(summarize_with_openai(api_key, chunk, custom_prompt))
#         elif api_name == 'anthropic':
#             summaries.append(summarize_with_anthropic(api_key, chunk, anthropic_model, custom_prompt))
#         elif api_name == 'cohere':
#             summaries.append(summarize_with_cohere(api_key, chunk, cohere_model, custom_prompt))
#         elif api_name == 'groq':
#             summaries.append(summarize_with_groq(api_key, chunk, groq_model, custom_prompt))
#         elif api_name == 'llama':
#             summaries.append(summarize_with_llama(llama_api_IP, chunk, api_key, custom_prompt))
#         elif api_name == 'kobold':
#             summaries.append(summarize_with_kobold(kobold_api_IP, chunk, api_key, custom_prompt))
#         elif api_name == 'ooba':
#             summaries.append(summarize_with_oobabooga(ooba_api_IP, chunk, api_key, custom_prompt))
#         elif api_name == 'tabbyapi':
#             summaries.append(summarize_with_vllm(api_key, tabby_api_IP, chunk, summarize.llm_model, custom_prompt))
#         elif api_name == 'local-llm':
#             summaries.append(summarize_with_local_llm(chunk, custom_prompt))
#         else:
#             return f"Unsupported API: {api_name}"
#
#     return "\n\n".join(summaries)
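#######################################
######### Rolling Summarization #########
# Neither get_chat_completion() nor rolling_summarize() is defined in this file
# as shown, but both are called by the functions below. These sketches are
# assumptions: they follow the signatures in the Function List above and the
# rolling-summarization recipe from the OpenAI cookbook, not recovered original code.
def get_chat_completion(messages, model='gpt-4-turbo'):
    """Return the text of a single chat completion for the given messages."""
    response = client.chat.completions.create(model=model, messages=messages, temperature=0)
    return response.choices[0].message.content


def rolling_summarize(text: str,
                      detail: float = 0,
                      model: str = 'gpt-4-turbo',
                      additional_instructions: Optional[str] = None,
                      minimum_chunk_size: Optional[int] = 500,
                      chunk_delimiter: str = ".",
                      summarize_recursively=False,
                      verbose=False) -> str:
    """Summarize `text` chunk by chunk. `detail` in [0, 1] trades brevity
    (0: a few large chunks) against detail (1: many small chunks)."""
    assert 0 <= detail <= 1, "detail must be in [0, 1]"
    # Interpolate the number of chunks between 1 and the maximum that
    # minimum_chunk_size allows, based on the requested detail level.
    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    num_chunks = int(1 + detail * (max_chunks - 1))
    # Derive a per-chunk token budget from the document length and chunk count.
    document_length = len(openai_tokenize(text))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")

    system_message_content = "Rewrite this text in summarized form."
    if additional_instructions is not None:
        system_message_content += f"\n\n{additional_instructions}"

    accumulated_summaries = []
    for chunk in text_chunks:
        if summarize_recursively and accumulated_summaries:
            # Feed earlier summaries back in so each chunk is summarized in context.
            accumulated_summaries_string = '\n\n'.join(accumulated_summaries)
            user_message_content = (f"Previous summaries:\n\n{accumulated_summaries_string}"
                                    f"\n\nText to summarize next:\n\n{chunk}")
        else:
            user_message_content = chunk
        messages = [
            {"role": "system", "content": system_message_content},
            {"role": "user", "content": user_message_content},
        ]
        accumulated_summaries.append(get_chat_completion(messages, model=model))

    return '\n\n'.join(accumulated_summaries)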
# FIXME - Whole section needs to be re-written
def summarize_with_detail_openai(text, detail, verbose=False):
    summary_with_detail_variable = rolling_summarize(text, detail=detail, verbose=verbose)
    if verbose:
        print(len(openai_tokenize(summary_with_detail_variable)))
    return summary_with_detail_variable
def summarize_with_detail_recursive_openai(text, detail, verbose=False):
    summary_with_recursive_summarization = rolling_summarize(text, detail=detail, summarize_recursively=True,
                                                             verbose=verbose)
    return summary_with_recursive_summarization
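# Example usage (illustrative; `long_text` is a placeholder for your own input):
# >>> summary = summarize_with_detail_openai(long_text, detail=0.5, verbose=True)
# >>> recursive_summary = summarize_with_detail_recursive_openai(long_text, detail=0.5)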
#
#
#################################################################################