Spaces:

stmnk
/

pygen

Runtime error

App Files Files Community

pygen / app.py

stmnk

Update app.py

9013cfa about 2 years ago

raw

history blame

12.2 kB

	import json
	import os

	import gradio as gr
	import requests as req

	from fun_strings import (
	    article_string,
	    descr_string,
	    dfs_code,
	    display_code,
	    function_code,
	    insert_code,
	    real_docstring,
	    tree_code,
	)

	# Sample natural-language intent for the (not yet implemented) code-generation path.
	code_nl = "function for db connection"

	# Hosted Inference API endpoint of the fine-tuned CodeT5 summarization model.
	CT5_URL = "https://api-inference.huggingface.co/models/stmnk/codet5-small-code-summarization-python"
	CT5_METHOD = 'POST'
	API_URL = CT5_URL

	# SECURITY: the API token must not live in source control. It is read from the
	# HF_API_TOKEN environment variable (e.g. a Space secret). The token that was
	# previously committed here is public and should be revoked.
	# When no token is configured, no Authorization header is sent at all rather
	# than sending a malformed "Bearer " value.
	_HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "").strip()
	headers = {"Authorization": f"Bearer {_HF_API_TOKEN}"} if _HF_API_TOKEN else {}

	def query(payload):
	    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

	    :param payload: JSON-serializable request body.
	    :return: the decoded JSON body of the response. If the request fails at
	        the transport level, or the body is not valid JSON, a dict of the form
	        ``{"error": "<description>"}`` is returned instead of raising, so the
	        caller can show the problem as text.
	    """
	    try:
	        # requests never times out by default; without a limit a stalled
	        # connection would block the UI callback indefinitely. The limit sits
	        # above the 80 s ``max_time`` that docgen_func asks the server for.
	        response = req.post(API_URL, headers=headers, json=payload, timeout=120)
	    except req.exceptions.RequestException as err:
	        return {"error": f"Request to the inference API failed: {err}"}

	    try:
	        return response.json()
	    except ValueError:
	        # requests' JSON decode error is a ValueError subclass; this happens
	        # e.g. when a gateway answers with an HTML error page.
	        return {"error": f"Inference API returned a non-JSON response (HTTP {response.status_code})"}

	# Sample Python source (an Elasticsearch-style `write_documents` method) used to
	# build the default model input below.
	# NOTE: this assignment rebinds the `function_code` name imported from fun_strings.
	function_code = r"""
	def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None,
	batch_size: int = 10_000, duplicate_documents: Optional[str] = None):

	if index and not self.client.indices.exists(index=index):
	self._create_document_index(index)

	if index is None:
	index = self.index
	duplicate_documents = duplicate_documents or self.duplicate_documents
	assert duplicate_documents in self.duplicate_documents_options,
	f"duplicate_documents parameter must be {', '.join(self.duplicate_documents_options)}"

	field_map = self._create_document_field_map()
	document_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents]
	document_objects = self._handle_duplicate_documents(documents=document_objects,
	index=index,
	duplicate_documents=duplicate_documents)
	documents_to_index = []
	for doc in document_objects:
	_doc = {
	"_op_type": "index" if duplicate_documents == 'overwrite' else "create",
	"_index": index,
	**doc.to_dict(field_map=self._create_document_field_map())
	} # type: Dict[str, Any]

	# cast embedding type as ES cannot deal with np.array
	if _doc[self.embedding_field] is not None:
	if type(_doc[self.embedding_field]) == np.ndarray:
	_doc[self.embedding_field] = _doc[self.embedding_field].tolist()

	# rename id for elastic
	_doc["_id"] = str(_doc.pop("id"))

	# don't index query score and empty fields
	_ = _doc.pop("score", None)
	_doc = {k:v for k,v in _doc.items() if v is not None}

	# In order to have a flat structure in elastic + similar behaviour to the other DocumentStores,
	# we "unnest" all value within "meta"
	if "meta" in _doc.keys():
	for k, v in _doc["meta"].items():
	_doc[k] = v
	_doc.pop("meta")
	documents_to_index.append(_doc)

	# Pass batch_size number of documents to bulk
	if len(documents_to_index) % batch_size == 0:
	bulk(self.client, documents_to_index, request_timeout=300, refresh=self.refresh_type)
	documents_to_index = []

	if documents_to_index:
	bulk(self.client, documents_to_index, request_timeout=300, refresh=self.refresh_type)

	"""

	# Default text of the code input box: the task prefix followed by the sample function above.
	task_code = f' Summarize Python: {function_code}'
	# task_code = f' Summarize Python: {dfs_code}'

	# Human-written docstring text describing document indexing in Elasticsearch;
	# presumably the reference docstring for the `write_documents` sample -- TODO confirm.
	# It is not referenced again in the visible part of this file.
	# NOTE: this assignment rebinds the `real_docstring` name imported from fun_strings.
	real_docstring = r"""
	Indexes documents for later queries in Elasticsearch.

	Behaviour if a document with the same ID already exists in ElasticSearch:
	a) (Default) Throw Elastic's standard error message for duplicate IDs.
	b) If `self.update_existing_documents=True` for DocumentStore: Overwrite existing documents.
	(This is only relevant if you pass your own ID when initializing a `Document`.
	If don't set custom IDs for your Documents or just pass a list of dictionaries here,
	they will automatically get UUIDs assigned. See the `Document` class for details)

	:param documents: a list of Python dictionaries or a list of Haystack Document objects.
	For documents as dictionaries, the format is {"content": "<the-actual-text>"}.
	Optionally: Include meta data via {"content": "<the-actual-text>",
	"meta":{"name": "<some-document-name>, "author": "somebody", ...}}
	It can be used for filtering and is accessible in the responses of the Finder.
	Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary
	should be changed to what you have set for self.content_field and self.name_field.
	:param index: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used.
	:param batch_size: Number of documents that are passed to Elasticsearch's bulk function at a time.
	:param duplicate_documents: Handle duplicates document based on parameter options.
	Parameter options : ( 'skip','overwrite','fail')
	skip: Ignore the duplicates documents
	overwrite: Update any existing documents with the same ID when adding documents.
	fail: an error is raised if the document ID of the document being added already
	exists.
	:raises DuplicateDocumentError: Exception trigger on duplicate document
	:return: None
	"""

	# Example model input (first row of the Interface `examples`): a minimal tree node class.
	# NOTE: this assignment rebinds the `tree_code` name imported from fun_strings.
	# NOTE(review): `__init__` here takes no `val`, while the `insert_code` sample
	# calls `Tree(val)` -- confirm whether the samples are meant to be consistent.
	tree_code = r"""
	class Tree:
	def __init__(self):
	self.val = None
	self.left = None
	self.right = None
	"""

	# Example model input (second row of the Interface `examples`): an `insert`
	# method that places `val` in the left or right subtree by comparison with `self.val`.
	# NOTE: this assignment rebinds the `insert_code` name imported from fun_strings.
	insert_code = r"""
	def insert(self, val):
	if self.val:
	if val < self.val:
	if self.left is None:
	self.left = Tree(val)
	else:
	self.left.insert(val)
	elif val > self.val:
	if self.right is None:
	self.right = Tree(val)
	else:
	self.right.insert(val)
	else:
	self.val = val
	"""

	# Example model input (third row of the Interface `examples`): a `display_tree`
	# method that visits the left child, prints the node value, then visits the right child.
	# NOTE: this assignment rebinds the `display_code` name imported from fun_strings.
	display_code = r"""
	def display_tree(self: Tree, prefix='value: '):
	current_node = self.val

	if self.left:
	self.left.display_tree()

	print(prefix, current_node)

	if self.right:
	self.right.display_tree()

	"""

	def pygen_func(nl_code_intent):
	    """Placeholder for generating code from a natural-language intent.

	    Not implemented yet: ``nl_code_intent`` is ignored and ``None`` is returned.
	    """
	    # TODO: generate code PL from intent NL + search in corpus
	    # Earlier sketch, kept for reference:
	    #   inputs = {'code_nl': code_nl}
	    #   payload = json.dumps(inputs)
	    #   prediction = req.request(CT5_METHOD, CT5_URL, data=payload)
	    #   prediction = req.request(CT5_METHOD, CT5_URL, json=req_data)
	    #   answer = json.loads(prediction.content.decode("utf-8"))
	    #   return str(answer)
	    # Alternative endpoint once tried:
	    #   CT5_URL = "https://api-inference.huggingface.co/models/nielsr/codet5-small-code-summarization-ruby"
	    return None

	def docgen_func(function_code, min_length, max_length, top_k, top_p, temp, repetition_penalty):
	    """Ask the hosted model for a docstring summarizing ``function_code``.

	    Builds the Inference API request from the UI values, sends it through
	    ``query`` and formats the reply for display.

	    :param function_code: text sent as the model input (task prefix + code).
	    :param min_length: minimum length of the summary, in tokens (cast to int).
	    :param max_length: maximum length of the summary, in tokens (cast to int).
	    :param top_k: number of top tokens considered when sampling (cast to int).
	    :param top_p: cumulative-probability threshold given as a percentage;
	        it is divided by 100 before being sent.
	    :param temp: sampling temperature (cast to float).
	    :param repetition_penalty: penalty applied to already used tokens (cast to float).
	    :return: the generated text wrapped in triple double quotes when the reply
	        is a list whose first item carries ``generated_text``; otherwise the
	        reply rendered with ``str``, followed by a retry hint when the API
	        reports that the model is still loading.
	    """
	    req_data = {
	        "inputs": function_code,
	        "parameters": {
	            # Minimum / maximum length, in tokens, of the output summary.
	            "min_length": int(min_length),
	            "max_length": int(max_length),
	            # Number of top tokens considered within the sample operation.
	            "top_k": int(top_k),
	            # Tokens are added from most to least probable until the sum of
	            # their probabilities exceeds top_p.
	            "top_p": float(top_p / 100),
	            # 1 means regular sampling, 0 means top_k=1, 100.0 approaches uniform.
	            "temperature": float(temp),
	            # The more a token was used, the more it is penalized in later passes.
	            "repetition_penalty": float(repetition_penalty),
	            # Soft limit, in seconds, on how long the query may take.
	            "max_time": 80,
	        },
	        "options": {
	            "use_gpu": False,  # GPU inference requires a paid plan
	            "use_cache": True,  # reuse cached results for requests already seen
	            "wait_for_model": False,  # get a 503-style error instead of waiting for a cold model
	        },
	    }
	    output = query(req_data)

	    if isinstance(output, list):
	        # Guard against an empty list or an item without the expected key
	        # instead of raising IndexError / KeyError inside the UI callback.
	        first = output[0] if output else None
	        if isinstance(first, dict) and "generated_text" in first:
	            return f'"""{first["generated_text"]}"""'
	        return str(output)

	    msg = str(output)
	    # Match on the error text rather than on one exact dict repr: the
	    # `estimated_time` reported by the API varies from call to call.
	    if isinstance(output, dict) and "currently loading" in str(output.get("error", "")):
	        return msg + ' Please wait for the model to load and try again'
	    return msg

	# Markdown passed to the Interface as `article`.
	# NOTE: both assignments below rebind names that are also imported from fun_strings.
	article_string = r"""CodeXGLUE task definition (and dataset): Code summarization (CodeSearchNet):

	_A model is given the task to generate natural language comments for a programming language code input._

	For further details, see the [CodeXGLUE](https://github.com/microsoft/CodeXGLUE) benchmark dataset and open challenge for code intelligence.
	"""
	# Plain-text summary passed to the Interface as `description`.
	descr_string = 'The application takes as input the python code for a function, or a class, and generates a documentation string, or code comment, for it using codeT5 fine tuned for code2text generation. Code to text generation, or code summarization, is a CodeXGLUE generation, or sequence to sequence, downstream task. CodeXGLUE stands for General Language Understanding Evaluation benchmark for code, which includes diversified code intelligence downstream inference tasks and datasets.'

	# Input widgets, listed in the positional order of docgen_func's parameters.
	_input_widgets = [
	    gr.inputs.Textbox(
	        lines=10,
	        label="Enter Task + Code in Python (Programming Language syntax, e.g. a Python function or class)",
	        default=task_code,
	    ),
	    gr.inputs.Slider(30, 200, default=100, label="Minimum Length (of the output summary, in tokens)"),
	    gr.inputs.Slider(200, 500, default=350, label="Maximum Length (of the output summary, in tokens)"),
	    gr.inputs.Slider(1, 7, default=3, step=1, label="Top K (tokens considered within the sample operation to create new text)"),
	    gr.inputs.Slider(0, 100, default=80, label="Top P (probability threshold for next tokens in sample of new text, cumulative)"),
	    gr.inputs.Slider(0, 100, default=1, label="Temperature (of the sampling operation)"),
	    gr.inputs.Slider(0, 100, default=70, label="Repetition Penalty (frequently previously used tokens are downsized)"),
	]

	# Single text output that shows the generated docstring.
	_output_widget = gr.outputs.Textbox(
	    label="Docstring Generated (Natural Language, code comment for documentation)",
	)

	# Clickable example rows: one value per input widget, in the same order.
	_example_rows = [
	    [tree_code, 50, 200, 2, 70, 10, 80],
	    [insert_code, 100, 250, 3, 90, 20, 90],
	    [display_code, 150, 300, 5, 100, 100, 95],
	]

	# The docstring generator is wired in; pygen_func (code generation) is still a stub.
	iface = gr.Interface(
	    fn=docgen_func,
	    inputs=_input_widgets,
	    outputs=_output_widget,
	    layout="unaligned",
	    title='Generate a documentation string for Python code',
	    description=descr_string,
	    article=article_string,
	    theme='grass',
	    examples=_example_rows,
	    verbose=True,
	)

	# iface.launch(share=True) # "share" not allowed in hf spaces? (!?!?)
	iface.launch()