# "Spaces: Sleeping" — Hugging Face Spaces status banner captured during page
# scrape; not part of the source code.
import gradio as gr | |
import aiimsscrapper | |
import amsscrapper | |
import degruyterscrapper | |
import sciencedirect | |
import sciencedirect_admaths | |
import springerscrapper | |
from urllib.parse import urlparse | |
from sheets import ExcelAutomator | |
from sgoogle import GoogleSheetAutomator | |
import os | |
import random | |
import string | |
import json | |
import crypto | |
import traceback | |
# HTTP basic-auth credential pairs for the Gradio app, read from the
# environment (USERNAME1/PASSWORD1 .. USERNAME3/PASSWORD3).
auth = [
    (os.getenv(f"USERNAME{n}"), os.getenv(f"PASSWORD{n}"))
    for n in range(1, 4)
]
# User-facing help text rendered as markdown by gr.Interface.
# BUGFIX: the header syntax was documented as "(to:from)" but the parser in
# handle_url reads the FIRST number as the start of the range (and the
# examples use "(37:38:...)"), so the docs now say "(from:to)". Typos in the
# user-visible text are also fixed.
description = """
![Logo](file=files/logo.png)
For bug reports or improvements, Contact us [@LogicSpine](https://t.me/LogicSpine) on Telegram.
**Usage Instructions:**
1. **Single Issue Scraping:**
- Provide the issue link in the URL section.
- Optionally, specify the desired output file name.
2. **Multiple Issues Scraping:**
- Use curly braces `{}` in the URL to indicate where the volume (`v`) and issue (`i`) numbers should be inserted.
- Define the range for volumes, not issues. Ensure you pass the volume range correctly.
- To pass the range, prefix the url with parentheses like `(from:to)-link/{v}/{i}` — check out the examples below.
- You can also pass multiple links, adding a range to each one in the same way.
- For multiple output files you must specify the name in the head of the url like `(from:to:output)` — check out the examples.
3. **Read this before using the Google Sheet feature**
- **IMPORTANT** First create a Google Drive folder and then give access to `[email protected]` and `[email protected]` — super important.
- Next, make sure to tick "Make Owner", because it will transfer full control over to you; you can delete or do anything you like with the file.
- You can also watch the data being added live if you open the same folder and check your output file.
- You will get the file link in output 4; you can access it too, but only the given mail will be able to access it.
- Even after the file is created, don't remove the access from the Google Drive folder, as more files will be added there if you keep using it.
- You can get the Google Drive folder id by going to drive.google.com -> create a new folder -> click on : -> Share -> enter both emails given above -> make sure to give Editor permission -> click Send.
- After this, to get the drive folder id, click on "copy link"; it will look like https://drive.google.com/drive/folders/folderid?usp=sharing — in this link, **folderid** is the folder id.
- **IMPORTANT**: After everything is done, make sure to accept the ownership, which you can do by clicking on : -> Share -> Accept ownership — congratulations, now you are the owner of the file.
**Note:**
- The range should be the volume range, not the issue range.
- Some authors may not have a listed address; their profile link will be included in the address section instead.
- After progress is completed, make sure to click on Clear, because sometimes the notification doesn't ring.
"""
# Pre-filled example rows for gr.Interface; each row matches handle_url's
# inputs: [Link, Gmail, FolderId, Output, MakeOwner, UseGoogleSheet, Reverse].
_DEMO_MAIL = "[email protected]"
_DEMO_FOLDER = "asdfasdfasdfasdfasdf"
_DEMO_LINKS = [
    ("https://www.ams.org/journals/jams/2024-37-01/home.html?active=allissues", "example"),
    ("(37:37:example1)https://www.ams.org/journals/jams/2024-{v}-{i}/home.html?active=allissues", "example1"),
    ("https://www.degruyter.com/journal/key/fca/20/2/html", "example2"),
    ("(22:23:example3)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html", "example3"),
    ("https://www.aimspress.com/math/article/2024/8/archive-articles", "example4"),
    ("(2021:2022:example5)-https://www.aimspress.com/math/article/{v}/{i}/archive-articles", "example5"),
    ("https://link.springer.com/journal/208/volumes-and-issues/388-3", "example6"),
    ("(388:389:example7)-https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}", "example7"),
    ("https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/158/suppl/C", "example8"),
    ("https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/38/issue/6", "example9"),
    ("(37:38:example10)-https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}", "example10"),
    # Multi-url example: several ranged links joined by newlines.
    (
        "\n".join([
            "(37:38:example11)-https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}",
            "(388:389:example12)-https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}",
            "(2021:2022:example13)-https://www.aimspress.com/math/article/{v}/{i}/archive-articles",
            "(22:23:example14)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html",
        ]),
        "example11",
    ),
]
exmpl = [
    [demo_link, _DEMO_MAIL, _DEMO_FOLDER, demo_name, True, True, False]
    for demo_link, demo_name in _DEMO_LINKS
]
# Global abort flag polled inside the scraping loops; it is never set to True
# anywhere in this file, so aborting relies on an external module mutating it.
stop_work = False
def generate_random_filename(length=8) -> str:
    """Build a random alphanumeric file stem (no extension).

    Args:
        length (int, optional): Number of characters in the stem. Defaults to 8.

    Returns:
        str: Random mix of ASCII letters and digits.
    """
    alphabet = string.ascii_letters + string.digits
    return "".join(random.choice(alphabet) for _ in range(length))
def _emit_authors(authors, sht, shet, usegooglesheet):
    """Persist each author record to Excel (and Sheets) and yield a progress dict."""
    for index, auth in enumerate(authors, start=1):
        sht.save(auth)
        if usegooglesheet:
            shet.save(auth)
        yield {"author": auth, "index": index}


def _scrape_issue_listing(links, ranged, fmt_issue, get_links, get_authors, sht, shet, usegooglesheet):
    """Scrape domains where each issue page lists article links (AMS, De Gruyter,
    AIMS, Springer).

    For ranged urls, issues i=1,2,... are tried per volume url until the link
    scraper raises, which is taken as "no more issues". `fmt_issue` converts the
    issue counter to the form the site expects (AMS wants two digits).
    """
    for base in links:
        issue = 1
        while True:
            if stop_work:
                break
            try:
                current_url = base.format(i=fmt_issue(issue)) if ranged else base
                article_links = get_links(current_url)
                issue += 1
            except Exception:
                break  # past the last issue (or fetch failed): next volume
            yield {"current_url": current_url, "status": "fetching"}
            for article in article_links:
                try:
                    authors = get_authors(article)
                except Exception:
                    continue  # skip articles whose author page cannot be parsed
                yield from _emit_authors(authors, sht, shet, usegooglesheet)
            if not ranged:
                break
            # Flush after each completed issue so partial progress survives.
            if usegooglesheet:
                shet.save_to_file()
            sht.save_to_file()
        if stop_work:
            break


def _scrape_sciencedirect(links, ranged, run_fn, sht, shet, usegooglesheet):
    """Scrape ScienceDirect issue pages, whose scraper returns authors directly.

    ScienceDirect serves the last issue again for out-of-range issue numbers,
    so a repeated page title marks the end of a volume.
    """
    previous_title = ""
    for base in links:
        issue = 1
        while True:
            if stop_work:
                break
            try:
                current_url = base.format(i=issue) if ranged else base
                authors, title = run_fn(current_url)
                if title == previous_title:
                    break
                previous_title = title
                issue += 1
            except Exception:
                print(f"Error : {traceback.format_exc()}")
                break
            yield {"current_url": current_url, "status": "fetching"}
            yield from _emit_authors(authors, sht, shet, usegooglesheet)
            if not ranged:
                break
            sht.save_to_file()
            if usegooglesheet:
                shet.save_to_file()
        if stop_work:
            break


def _final_result(sht, shet, usegooglesheet, owner, mail):
    """Flush both outputs, optionally transfer sheet ownership, build the final dict."""
    if owner and usegooglesheet:
        shet.transfer_ownership(mail)
    excel_path = sht.save_to_file()
    sheet_link = shet.save_to_file() if usegooglesheet else ""
    return {"final_output": excel_path, "link": sheet_link}


def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, output: str, owner: bool, mail: str, folder_id: str, credit: dict, usegooglesheet: bool, multiUrls: bool):
    """Dispatch a journal url to the matching scraper and stream progress dicts.

    Args:
        url (str): Issue/journal url, optionally containing {v}/{i} placeholders
        from_range (int): First volume of the range (ranged urls only)
        to_range (int): Last volume of the range (ranged urls only)
        reverse (bool): Walk the volume range in reverse order (ranged urls only)
        output (str): Output file name; a random name is generated when empty
        owner (bool): Transfer Google Sheet ownership to `mail` when sheets are used
        mail (str): Gmail address that owns the Drive folder
        folder_id (str): Google Drive folder id for the sheet
        credit (dict): Decrypted Google service-account credentials
        usegooglesheet (bool): Mirror output to Google Sheets
        multiUrls (bool): Whether the caller is processing multiple urls (unused here)

    Raises:
        gr.Error: For unsupported domains or unexpected scraper failures.

    Yields:
        dict: {"current_url", "status"} per fetched issue, {"author", "index"}
        per saved author, and one final {"final_output", "link"} dict.
    """
    if len(output.strip()) < 1:
        output = generate_random_filename()
    if os.path.exists(f"{output}.xlsx"):
        os.remove(f"{output}.xlsx")  # start each run with a fresh workbook
    domain = urlparse(url).hostname
    shet = None
    if usegooglesheet:
        shet = GoogleSheetAutomator(
            ["Name", "Address", "Email"],
            folder_id,
            outputfile=output,
            creds_dict=credit,
        )
    print(f"Output file name : {output}")
    sht = ExcelAutomator(["Name", "Address", "Email"], output)
    ranged = "{" in url
    if ranged:
        volumes = range(from_range, to_range + 1)
        if reverse:
            volumes = reversed(volumes)
        # Substitute the volume now; keep "{i}" for per-issue formatting later.
        links = [url.format(v=vol, i="{i}") for vol in volumes]
    else:
        links = [url]
    try:
        if domain in ("www.ams.org", "ams.org"):
            # AMS zero-pads single-digit issue numbers ("01", "02", ...).
            yield from _scrape_issue_listing(links, ranged, lambda i: f"{i:02d}", amsscrapper.getlinks, amsscrapper.get_authors, sht, shet, usegooglesheet)
        elif domain in ("www.degruyter.com", "degruyter.com"):
            yield from _scrape_issue_listing(links, ranged, str, degruyterscrapper.getLinks, degruyterscrapper.get_author_details, sht, shet, usegooglesheet)
        elif domain in ("www.aimspress.com", "aimspress.com"):
            yield from _scrape_issue_listing(links, ranged, str, aiimsscrapper.get_links, aiimsscrapper.get_author_details, sht, shet, usegooglesheet)
        elif domain == "link.springer.com":
            yield from _scrape_issue_listing(links, ranged, str, springerscrapper.get_all_articals_link, springerscrapper.get_authors, sht, shet, usegooglesheet)
        elif domain == "www.sciencedirect.com":
            # BUGFIX: the acta-mathematica-scientia branch was unreachable
            # because the generic sciencedirect check matched the domain first;
            # pick the specialized scraper here instead.
            run_fn = sciencedirect_admaths.run if "acta-mathematica-scientia" in url else sciencedirect.run
            yield from _scrape_sciencedirect(links, ranged, run_fn, sht, shet, usegooglesheet)
        else:
            raise gr.Error("Invalid URL. Contact @H4CK3R_5M4CK3R on Telegram")
        yield _final_result(sht, shet, usegooglesheet, owner, mail)
    except gr.Error:
        # BUGFIX: was `pass`, which silently swallowed the "Invalid URL" error
        # so it never reached the UI.
        raise
    except Exception:
        raise gr.Error("An error occurred. Check your URL or contact @H4CK3R_5M4CK3R on Telegram")
def handle_url(Link: str, Gmail: str, FolderId: str, Output: str, MakeOwner:bool=True, UseGoogleSheet:bool=True, Reverse: bool=False):
    """Gradio entry point: parse the input url(s) and stream scraping progress.

    Args:
        Link (str): One url, or several urls separated by newlines
        Gmail (str): Gmail to receive sheet ownership; falls back to $GMAIL
        FolderId (str): Google Drive folder id; falls back to $FOLDER_ID
        Output (str): Output file name (overridden per-url by "(from:to:output)" headers)
        MakeOwner (bool, optional): Transfer sheet ownership to `Gmail`. Defaults to True.
        UseGoogleSheet (bool, optional): Mirror output to Google Sheets. Defaults to True.
        Reverse (bool, optional): Walk the volume range in reverse. Defaults to False.

    Raises:
        gr.Error: If a ranged url is missing its output name.

    Yields:
        tuple: (authors_log, details_log, file_output, notification_audio, sheet_link)
    """
    if len(FolderId) < 2:
        FolderId = os.getenv("FOLDER_ID")
    if len(Gmail) < 2:
        Gmail = os.getenv("GMAIL")
    authors = []
    details = []
    final_output = None
    final_outputs = []
    link = None
    links = []
    From_volume = 0
    To_Volume = 0
    if "\n" in Link:
        urls = Link.split("\n")
        multiUrls = True
    else:
        urls = [Link]
        multiUrls = False
    print(f"URLS : {urls}")
    # BUGFIX: `credit` was only bound inside the try below, so a failed
    # decryption raised NameError at the filterUrlandRun call instead of a
    # clean downstream error.
    credit = None
    for url in urls:
        if url.startswith("("):
            # Header format "(from:to:output)-url": parse volume range + output name.
            head = url.split("(", 1)[1]
            parts = head.split(":")
            From_volume = int(parts[0])
            To_Volume = int(parts[1].split(")", 1)[0])
            try:
                Output = parts[2].split(")", 1)[0]
            except IndexError:
                raise gr.Error(f"No output found for {url} you must need to specify the output")
            url = url.split("-", 1)[1].strip()
        try:
            # Service-account credentials are stored encrypted in the environment.
            credit = json.loads(crypto.decrypt(os.getenv("KEY"), os.getenv("GOOGLE_AUTH_CREDENTIALS")))
        except Exception:
            pass  # best effort: sheet features simply fail later without creds
        for result in filterUrlandRun(url, From_volume, To_Volume, Reverse, Output, MakeOwner, Gmail, FolderId, credit=credit, usegooglesheet=UseGoogleSheet, multiUrls=multiUrls):
            if "final_output" in result:
                # BUGFIX: these branches were swapped — the sheet link was
                # stored in `link` for multi-url runs (which yield `links`) and
                # appended to `links` for single-url runs (which yield `link`),
                # so the link was never displayed in either mode.
                if multiUrls:
                    final_outputs.append(result["final_output"])
                    links.append(result["link"])
                else:
                    final_output = result["final_output"]
                    link = result["link"]
            else:
                if "author" in result:
                    authors.append(f"Saving Author: {result['author'].get('Name')}\n")
                if "current_url" in result:
                    details.append(f"Scraping: {result['current_url']}\n")
            authors = authors[-3:]  # keep only the last three log lines
            details = details[-3:]
            if multiUrls:
                final_output = None
                yield "\n".join(authors), "\n".join(details), None, None, None
            else:
                yield "\n".join(authors), "\n".join(details), final_output if final_output else None, gr.Audio("notification.mp3", autoplay=True) if final_output else None, link if final_output else None
            print("\n".join(authors) + "\n".join(details))
    if multiUrls:
        yield "\n".join(authors), "\n".join(details), final_outputs, gr.Audio("notification.mp3", autoplay=True), links
    else:
        # BUGFIX: previously yielded the `links` list here instead of the
        # single sheet link.
        yield "\n".join(authors), "\n".join(details), final_output, gr.Audio("notification.mp3", autoplay=True), link
# Wire up the Gradio UI: inputs mirror handle_url's signature; outputs are two
# scrolling log panes, the result file, a notification sound and the sheet link.
_INPUTS = [
    gr.TextArea(label="Url / Url's", placeholder="Enter the url or multiple urls to scrap"),
    gr.Textbox(label="Access Gmail (Check Docs)"),
    gr.Textbox(label="Google Folder ID (Check Docs)"),
    gr.Textbox(label="Output File Name"),
    gr.Checkbox(True, label="Make Owner"),
    gr.Checkbox(True, label="Use Google Sheets"),
    "checkbox",
]
_OUTPUTS = [
    gr.Markdown("LOGS", height="250px", elem_id="authorscroll"),
    gr.Markdown("", height="100px", elem_id="authorscroll"),
    "file",
    "audio",
    "textbox",
]
interface = gr.Interface(
    fn=handle_url,
    inputs=_INPUTS,
    outputs=_OUTPUTS,
    title="NetMiner",
    description=description,
    examples=exmpl,
    cache_examples=False,
    thumbnail="logo.png",
)
# Serve locally with basic auth; "files" must be allowed so the logo resolves.
interface.launch(share=False, show_api=False, allowed_paths=["files"], auth=auth)