# PrmScrp / app.py
# (Hugging Face Space page header, kept as comments so the file stays valid Python)
# pryanshusharma's picture - Update app.py
# 95acdc2 verified
import gradio as gr
import aiimsscrapper
import amsscrapper
import degruyterscrapper
import sciencedirect
import sciencedirect_admaths
import springerscrapper
from urllib.parse import urlparse
from sheets import ExcelAutomator
from sgoogle import GoogleSheetAutomator
import os
import random
import string
import json
import crypto
import traceback
# Basic-auth credential pairs for the Gradio login screen, read from the
# environment variables USERNAME1/PASSWORD1 ... USERNAME3/PASSWORD3.
auth = [
    (os.getenv(f"USERNAME{n}"), os.getenv(f"PASSWORD{n}"))
    for n in range(1, 4)
]
# Markdown shown at the top of the Gradio page: logo, contact link, and user
# instructions for single/multi-issue scraping and the Google Sheets workflow.
# NOTE: this is a runtime string rendered to users; its wording (including
# informal phrasing) is left untouched here.
description = """
![Logo](file=files/logo.png)
For bug reports or improvements, Contact us [@LogicSpine](https://t.me/LogicSpine) on Telegram.
**Usage Instructions:**
1. **Single Issue Scraping:**
- Provide the issue link in the URL section.
- Optionally, specify the desired output file name.
2. **Multiple Issues Scraping:**
- Use curly braces `{}` in the URL to indicate where the volume (`v`) and issue (`i`) numbers should be inserted.
- Define the range for volumes, not issues. Ensure you pass the volume range correctly.
- To pass the range you must have use () in start of the url something like (to:from)-link/{v}/{i} checkout example below.
- Now you can also pass the multiple links too and add range into same as above.
- For multiple output file you must have to specific the name in the head of the url like (to:from:output) checkout example's
3. **Read this before using google sheet feature**
- **IMPORTANT** First make a google drive folder and then gave access to `[email protected]` and `[email protected]` super important.
- Next make sure to check for make owner because it will transfer full control over to you you can delete or do anything as you like with the file.
- You can also check for live addition of the data if you open the same folder and check for your output file.
- You will get the file link in the output 4 you can also access it but only given mail will be able to access it.
- Even after creating the file dont remove the access from the google drive folder as are gonna add more file if you like in there
- You can get google drive folder id by just go to the drive.google.com -> create new folder -> click on : -> Share -> Enter the email given above both -> Make sure to gave Editor permission -> Click on Send
- After this to get the drive folder id click on copy link and then it will look like this https://drive.google.com/drive/folders/folderid?usp=sharing now in this like **folderid** is the folder id.
- **IMPORTANT** : After everything is done make sure to accept the ownership which you can do by Click on : -> Share -> Accept ownership and congo now you are the sweet owner of the file do as you like to do.
**Note:**
- The range should be the volume range, not the issue range.
- Some authors may not have a listed address; their profile link will be included in the address section instead.
- After progress is completed make sure to click on clear because sometimes notification does't ring
"""
# Example rows for the Gradio interface. Each row maps onto handle_url's
# parameters, in order:
#   [Link, Gmail, FolderId, Output, MakeOwner, UseGoogleSheet, Reverse]
# Rows whose Link starts with "(from:to:output)" demonstrate the volume-range
# syntax; the last row demonstrates several newline-separated ranged URLs.
exmpl = [
    ["https://www.ams.org/journals/jams/2024-37-01/home.html?active=allissues", "[email protected]", "asdfasdfasdfasdfasdf", "example", True, True, False],
    ["(37:37:example1)https://www.ams.org/journals/jams/2024-{v}-{i}/home.html?active=allissues", "[email protected]", "asdfasdfasdfasdfasdf", "example1", True, True, False],
    ["https://www.degruyter.com/journal/key/fca/20/2/html", "[email protected]", "asdfasdfasdfasdfasdf", "example2", True, True, False],
    ["(22:23:example3)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html", "[email protected]", "asdfasdfasdfasdfasdf", "example3", True, True, False],
    ["https://www.aimspress.com/math/article/2024/8/archive-articles", "[email protected]", "asdfasdfasdfasdfasdf", "example4", True, True, False],
    ["(2021:2022:example5)-https://www.aimspress.com/math/article/{v}/{i}/archive-articles", "[email protected]", "asdfasdfasdfasdfasdf", "example5", True, True, False],
    ["https://link.springer.com/journal/208/volumes-and-issues/388-3", "[email protected]", "asdfasdfasdfasdfasdf", "example6", True, True, False],
    ["(388:389:example7)-https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}", "[email protected]", "asdfasdfasdfasdfasdf", "example7", True, True, False],
    ["https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/158/suppl/C", "[email protected]", "asdfasdfasdfasdfasdf", "example8", True, True, False],
    ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/38/issue/6", "[email protected]", "asdfasdfasdfasdfasdf", "example9", True, True, False],
    ["(37:38:example10)-https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}", "[email protected]", "asdfasdfasdfasdfasdf", "example10", True, True, False],
    ["(37:38:example11)-https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}\n(388:389:example12)-https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}\n(2021:2022:example13)-https://www.aimspress.com/math/article/{v}/{i}/archive-articles\n(22:23:example14)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html", "[email protected]", "asdfasdfasdfasdfasdf", "example11", True, True, False]
]
# Module-level flag polled by the scraping loops; when True, scraping stops.
# NOTE(review): nothing in this file sets it to True — presumably toggled
# externally or left over from a removed "stop" button; confirm before removal.
stop_work = False

def generate_random_filename(length=8) -> str:
    """Generate a unique random name for the file without extension

    Args:
        length (int, optional): Length of the file name without extension. Defaults to 8.

    Returns:
        str: Random file name
    """
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choice(alphabet) for _ in range(length))
def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, output: str, owner: bool, mail: str, folder_id: str, credit: dict, usegooglesheet: bool, multiUrls: bool):
    """Dispatch *url* to the matching site scraper and stream progress events.

    Args:
        url (str): Issue/journal URL, optionally containing ``{v}``/``{i}``
            placeholders for volume and issue numbers.
        from_range (int): First volume when ``{v}`` is present in the URL.
        to_range (int): Last volume (inclusive) when ``{v}`` is present.
        reverse (bool): Iterate volumes from high to low (range mode only).
        output (str): Output file name without extension; a random name is
            generated when empty.
        owner (bool): If Google Sheets is enabled, transfer sheet ownership
            to *mail* once scraping finishes.
        mail (str): Gmail address that owns the destination Drive folder.
        folder_id (str): Google Drive folder id for the generated sheet.
        credit (dict): Decrypted Google service-account credentials.
        usegooglesheet (bool): Mirror every saved row into a Google sheet.
        multiUrls (bool): Caller is processing several URLs (informational).

    Raises:
        gr.Error: For unsupported domains or any unexpected scraper failure.

    Yields:
        dict: ``{"current_url", "status"}`` while fetching an issue page,
        ``{"author", "index"}`` for each saved author, and finally
        ``{"final_output", "link"}`` with the Excel path and sheet link.
    """
    if len(output.strip()) < 1:
        output = generate_random_filename()
    # Start from a clean workbook so reruns do not append to stale data.
    if os.path.exists(f"{output}.xlsx"):
        os.remove(f"{output}.xlsx")
    domain = urlparse(url).hostname
    shet = None
    if usegooglesheet:
        shet = GoogleSheetAutomator(
            ["Name", "Address", "Email"],
            folder_id,
            outputfile=output,
            creds_dict=credit,
        )
    print(f"Output file name : {output}")
    sht = ExcelAutomator(["Name", "Address", "Email"], output)

    # Range mode: expand {v} for every requested volume now; keep {i} as a
    # placeholder that the per-domain loop fills with increasing issue numbers.
    ranged = "{" in url
    if ranged:
        volumes = range(from_range, to_range + 1)
        if reverse:
            volumes = reversed(volumes)
        links = [url.format(v=vol, i="{i}") for vol in volumes]
    else:
        links = [url]

    def save_authors(authors):
        # Persist each author locally (and to the sheet) and stream progress.
        for index, auth_detail in enumerate(authors, start=1):
            sht.save(auth_detail)
            if shet is not None:
                shet.save(auth_detail)
            yield {"author": auth_detail, "index": index}

    def finish():
        # Flush files, optionally hand the sheet over, and emit the final event.
        if owner and shet is not None:
            shet.transfer_ownership(mail)
        sheet_link = shet.save_to_file() if shet is not None else ""
        yield {"final_output": sht.save_to_file(), "link": sheet_link}

    def run_link_scraper(get_links, get_authors, format_issue):
        # Generic driver for sites where one page lists article links and a
        # second request per article returns its author details.
        for base in links:
            issue = 1
            while not stop_work:
                try:
                    current_url = base.format(i=format_issue(issue)) if ranged else base
                    article_links = get_links(current_url)
                    issue += 1
                except Exception:
                    # No such issue (or fetch failed): advance to next volume.
                    break
                yield {"current_url": current_url, "status": "fetching"}
                for article in article_links:
                    try:
                        authors = get_authors(article)
                    except Exception:
                        continue  # skip articles whose author page breaks
                    yield from save_authors(authors)
                if not ranged:
                    break
                # Checkpoint after each issue so partial runs keep their data.
                if shet is not None:
                    shet.save_to_file()
                sht.save_to_file()
            if stop_work:
                break
        yield from finish()

    def run_direct_scraper(run_fn):
        # Driver for ScienceDirect-style scrapers whose run() returns the
        # author list directly along with the issue title; a repeated title
        # means the site served the previous issue again, so we stop.
        last_title = ""
        for base in links:
            issue = 1
            while not stop_work:
                try:
                    current_url = base.format(i=issue) if ranged else base
                    authors, title = run_fn(current_url)
                    if title == last_title:
                        break
                    last_title = title
                    issue += 1
                except Exception:
                    print(f"Error : {traceback.format_exc()}")
                    break
                yield {"current_url": current_url, "status": "fetching"}
                yield from save_authors(authors)
                if not ranged:
                    break
                sht.save_to_file()
                if shet is not None:
                    shet.save_to_file()
            if stop_work:
                break
        yield from finish()

    try:
        if domain in ("www.ams.org", "ams.org"):
            # AMS issue numbers are zero padded ("01", "02", ...).
            yield from run_link_scraper(
                amsscrapper.getlinks, amsscrapper.get_authors, lambda i: f"{i:02d}"
            )
        elif domain in ("www.degruyter.com", "degruyter.com"):
            yield from run_link_scraper(
                degruyterscrapper.getLinks, degruyterscrapper.get_author_details, str
            )
        elif domain in ("www.aimspress.com", "aimspress.com"):
            yield from run_link_scraper(
                aiimsscrapper.get_links, aiimsscrapper.get_author_details, str
            )
        elif domain == "link.springer.com":
            yield from run_link_scraper(
                springerscrapper.get_all_articals_link, springerscrapper.get_authors, str
            )
        elif domain == "www.sciencedirect.com":
            # BUGFIX: the acta-mathematica-scientia branch was unreachable
            # before, because a generic "www.sciencedirect.com" branch matched
            # first; sciencedirect_admaths.run was therefore never called.
            if "acta-mathematica-scientia" in url:
                yield from run_direct_scraper(sciencedirect_admaths.run)
            else:
                yield from run_direct_scraper(sciencedirect.run)
        else:
            raise gr.Error("Invalid URL. Contact @H4CK3R_5M4CK3R on Telegram")
    except gr.Error:
        # BUGFIX: this was `pass`, silently swallowing the "Invalid URL" error
        # so the user got no feedback at all.
        raise
    except Exception as exc:
        raise gr.Error("An error occurred. Check your URL or contact @H4CK3R_5M4CK3R on Telegram") from exc
def handle_url(Link: str, Gmail: str, FolderId: str, Output: str, MakeOwner:bool=True, UseGoogleSheet:bool=True, Reverse: bool=False):
    """Main function to handle the core of the gradio app.

    Args:
        Link (str): Url (or newline-separated urls) which need to be scraped.
        Gmail (str): Gmail of the user if they want to use google sheets.
        FolderId (str): Google Drive folder id.
        Output (str): Output file name.
        MakeOwner (bool, optional): Make the given gmail the sheet owner. Defaults to True.
        UseGoogleSheet (bool, optional): Mirror results into google sheets. Defaults to True.
        Reverse (bool, optional): Run the volume loop in reverse order. Defaults to False.

    Raises:
        gr.Error: If a ranged url is missing its output name, or scraping fails.

    Yields:
        tuple: (author log, url log, file output(s), notification audio, sheet link(s)).
    """
    if len(FolderId) < 2:
        FolderId = os.getenv("FOLDER_ID")
    if len(Gmail) < 2:
        Gmail = os.getenv("GMAIL")
    # BUGFIX: decrypt the credentials once, before the loop (it is loop
    # invariant), and initialise `credit` so a decryption failure no longer
    # leaves the name unbound (previously a NameError at the call below).
    credit = None
    try:
        credit = json.loads(crypto.decrypt(os.getenv("KEY"), os.getenv("GOOGLE_AUTH_CREDENTIALS")))
    except Exception:
        # Without credentials the Google-sheet mirror simply cannot be used.
        pass
    authors = []
    details = []
    final_output = None
    final_outputs = []
    link = None
    links = []
    From_volume = 0
    To_Volume = 0
    if "\n" in Link:
        urls = Link.split("\n")
        multiUrls = True
    else:
        urls = [Link]
        multiUrls = False
    print(f"URLS : {urls}")
    for url in urls:
        if url.startswith("("):
            # "(from:to:output)" header — parse the volume range + output name.
            header = url.split("(", 1)[1].split(")", 1)[0]
            parts = header.split(":")
            From_volume = int(parts[0])
            To_Volume = int(parts[1])
            try:
                Output = parts[2]
            except IndexError:
                raise gr.Error(f"No output found for {url} you must need to specify the output")
            # BUGFIX: strip the "(...)" header and an optional "-" separator
            # instead of splitting on the first "-", which corrupted URLs whose
            # path contains a dash when the ")-" separator was omitted
            # (e.g. the "(37:37:example1)https://www.ams.org/..." example).
            url = url.split(")", 1)[1].lstrip("-").strip()
        for result in filterUrlandRun(url, From_volume, To_Volume, Reverse, Output, MakeOwner, Gmail, FolderId, credit=credit, usegooglesheet=UseGoogleSheet, multiUrls=multiUrls):
            if "final_output" in result:
                # BUGFIX: the multi/single branches for the link were swapped:
                # multi-url runs stored a scalar `link` while single runs
                # appended to `links`, so the final multi yield emitted an
                # empty list and single runs never exposed their sheet link.
                if multiUrls:
                    final_outputs.append(result["final_output"])
                    links.append(result["link"])
                else:
                    final_output = result["final_output"]
                    link = result["link"]
            else:
                if "author" in result:
                    authors.append(f"Saving Author: {result['author'].get('Name')}\n")
                if "current_url" in result:
                    details.append(f"Scraping: {result['current_url']}\n")
            # Keep only the last three lines of each rolling log.
            authors = authors[-3:]
            details = details[-3:]
            if multiUrls:
                yield "\n".join(authors), "\n".join(details), None, None, None
            else:
                yield "\n".join(authors), "\n".join(details), final_output if final_output else None, gr.Audio("notification.mp3", autoplay=True) if final_output else None, link if final_output else None
            print("\n".join(authors) + "\n".join(details))
    if multiUrls:
        yield "\n".join(authors), "\n".join(details), final_outputs, gr.Audio("notification.mp3", autoplay=True), links
    else:
        yield "\n".join(authors), "\n".join(details), final_output, gr.Audio("notification.mp3", autoplay=True), link
# Wire handle_url into a Gradio Interface. Inputs match handle_url's
# parameters in order: Link, Gmail, FolderId, Output, MakeOwner,
# UseGoogleSheet, Reverse. Outputs: author log (markdown), url log (markdown),
# result file, notification audio, sheet link textbox.
interface = gr.Interface(
    fn=handle_url,
    inputs=[gr.TextArea(label="Url / Url's", placeholder="Enter the url or multiple urls to scrap"), gr.Textbox(label="Access Gmail (Check Docs)"), gr.Textbox(label="Google Folder ID (Check Docs)"), gr.Textbox(label="Output File Name"), gr.Checkbox(True, label="Make Owner"), gr.Checkbox(True, label="Use Google Sheets"), "checkbox"],
    outputs=[gr.Markdown("LOGS", height="250px", elem_id="authorscroll"), gr.Markdown("", height="100px", elem_id="authorscroll"), "file", "audio", "textbox"],
    title="NetMiner",
    description=description,
    examples=exmpl,
    # Examples hit the live scrapers, so example caching is disabled.
    cache_examples=False,
    thumbnail="logo.png"
)
# Launch behind basic auth (credential pairs from `auth`); "files" is exposed
# so the logo referenced in `description` can be served.
interface.launch(
    share=False,
    show_api=False,
    allowed_paths=["files"],
    auth=auth
)