import gradio as gr
import aiimsscrapper
import amsscrapper
import degruyterscrapper
import sciencedirect
import sciencedirect_admaths
import springerscrapper
from urllib.parse import urlparse
from sheets import ExcelAutomator
from sgoogle import GoogleSheetAutomator
import os
import random
import string
import json
import crypto

# Login credentials come from environment variables (USERNAME1-3 / PASSWORD1-3);
# FOLDER_ID, GMAIL, KEY, and GOOGLE_AUTH_CREDENTIALS are read further below.
auth = [
    (os.getenv("USERNAME1"), os.getenv("PASSWORD1")),
    (os.getenv("USERNAME2"), os.getenv("PASSWORD2")),
    (os.getenv("USERNAME3"), os.getenv("PASSWORD3")),
]

description = """
For bug reports or improvements, contact [@H4CK3R_5M4CK3R](https://t.me/H4CK3R_5M4CK3R) on Telegram.

**Usage Instructions:**

1. **Single Issue Scraping:**
   - Provide the issue link in the URL section.
   - Optionally, specify the desired output file name.

2. **Multiple Issues Scraping:**
   - Use curly braces `{}` in the URL to mark where the volume (`v`) and issue (`i`) numbers should be inserted.
   - Define the range for volumes, not issues, and make sure you pass the volume range correctly.
   - To pass the range, wrap it in parentheses at the start of the URL, like `(from:to)-link/{v}/{i}`; check out the examples below.
   - You can also pass multiple links (one per line) and add a range to each of them in the same way.

3. **Read this before using the Google Sheets feature:**
   - **IMPORTANT:** First create a Google Drive folder, then grant access to both `sheettesting@testing-430816.iam.gserviceaccount.com` and `primearchiveofficial@gmail.com`.
   - Next, make sure to check **Make Owner**: it transfers full control of the file to you, so you can delete it or do anything else you like with it.
   - You can watch the data being added live by opening the same folder and checking your output file.
   - You will get the file link in output 4. You can open it too, but only the email addresses given above will have access.
   - Even after the file is created, do not remove the access from the Google Drive folder, as more files may be added there later.
   - To get the Google Drive folder ID: go to drive.google.com -> create a new folder -> click on the ⋮ menu -> Share -> enter both email addresses given above -> make sure to grant Editor permission -> click Send.
   - Then click "Copy link". The link will look like `https://drive.google.com/drive/folders/folderid?usp=sharing`; here **folderid** is the folder ID.
   - **IMPORTANT:** After everything is done, accept the ownership by clicking ⋮ -> Share -> Accept ownership. Congratulations, you are now the owner of the file and can do with it as you like.

**Note:**

- The range should be the volume range, not the issue range.
- Some authors may not have a listed address; their profile link will be included in the address section instead.
- After progress is completed, make sure to click Clear, because sometimes the notification does not ring.

If you are copying this Space, make sure to contact the owner mentioned above.
"""
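# A minimal illustration of the range convention described above (the actual
# expansion happens inside filterUrlandRun below): a ranged URL such as
#
#   "(22:23)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html"
#
# is split into one template per volume, with the issue placeholder kept:
#
#   "https://www.degruyter.com/journal/key/fca/22/{i}/html"
#   "https://www.degruyter.com/journal/key/fca/23/{i}/html"
#
# The issue number {i} then starts at 1 and is incremented until the target
# site stops returning a valid page.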
exmpl = [
    ["https://www.ams.org/journals/jams/2024-37-01/home.html?active=allissues", "example@gmail.com", "asdfasdfasdfasdfasdf", "example1", True, True, False],
    ["https://www.degruyter.com/journal/key/fca/20/2/html", "example@gmail.com", "asdfasdfasdfasdfasdf", "example2", True, True, False],
    ["(22:23)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html", "example@gmail.com", "asdfasdfasdfasdfasdf", "example3", True, True, False],
    ["https://www.aimspress.com/math/article/2024/8/archive-articles", "example@gmail.com", "asdfasdfasdfasdfasdf", "example4", True, True, False],
    ["(2021:2022)-https://www.aimspress.com/math/article/{v}/{i}/archive-articles", "example@gmail.com", "asdfasdfasdfasdfasdf", "example5", True, True, False],
    ["https://link.springer.com/journal/208/volumes-and-issues/388-3", "example@gmail.com", "asdfasdfasdfasdfasdf", "example6", True, True, False],
    ["(388:389)-https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}", "example@gmail.com", "asdfasdfasdfasdfasdf", "example7", True, True, False],
    ["https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/158/suppl/C", "example@gmail.com", "asdfasdfasdfasdfasdf", "example8", True, True, False],
    ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/38/issue/6", "example@gmail.com", "asdfasdfasdfasdfasdf", "example9", True, True, False],
    ["(37:38)-https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}", "example@gmail.com", "asdfasdfasdfasdfasdf", "example10", True, True, False],
    ["(37:38)-https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}\n(388:389)-https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}\n(2021:2022)-https://www.aimspress.com/math/article/{v}/{i}/archive-articles\n(22:23)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html", "example@gmail.com", "asdfasdfasdfasdfasdf", "example11", True, True, False],
]

stop_work = False

def generate_random_filename(length=8):
    random_string = ''.join(random.choices(string.ascii_letters + string.digits, k=length))
    return random_string
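# Each scraper module follows the same informal contract (inferred from the
# call sites below, since the modules themselves are not shown here): one
# function returns the article links found on an issue page, and one returns
# an article's authors as dicts keyed "Name", "Address", and "Email" — the
# same headers used for the Excel and Google Sheet outputs. For example:
#
#   allLinks = degruyterscrapper.getLinks(issue_url)
#   authors = degruyterscrapper.get_author_details(allLinks[0])
#   # -> [{"Name": "...", "Address": "...", "Email": "..."}, ...]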
def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, output: str, owner: bool, mail: str, folder_id: str, credit: dict, usegooglesheet: bool, multiUrls: bool):
    # Generator: yields progress dicts ({"current_url": ...} / {"author": ...})
    # while scraping, and a final {"final_output": ..., "link": ...} at the end.
    if len(output.strip()) < 1:
        output = generate_random_filename()
    if os.path.exists(f"{output}.xlsx"):
        os.remove(f"{output}.xlsx")
    url_sch = urlparse(url)
    domain = url_sch.hostname
    if usegooglesheet:
        shet = GoogleSheetAutomator(
            ["Name", "Address", "Email"],
            folder_id,
            outputfile=output,
            creds_dict=credit,
        )
    sht = ExcelAutomator(["Name", "Address", "Email"], output)
    filen = True
    if "{" in url:
        # Ranged URL: build one template per volume, leaving {i} to be filled
        # with successive issue numbers below.
        links = []
        if reverse:
            for vol in reversed(range(from_range, to_range + 1)):
                links.append(url.format(v=vol, i="{i}"))
        else:
            for vol in range(from_range, to_range + 1):
                links.append(url.format(v=vol, i="{i}"))
    else:
        links = [url]
        filen = False
    try:
        if domain == "www.ams.org" or domain == "ams.org":
            for ur in links:
                isu = 1
                while True:
                    if stop_work:
                        break
                    try:
                        if filen:
                            # AMS zero-pads single-digit issue numbers in its URLs.
                            current_url = ur.format(i=(str(isu) if len(str(isu)) > 1 else f"0{isu}"))
                            allLinks = amsscrapper.getlinks(current_url)
                            isu += 1
                        else:
                            current_url = ur
                            allLinks = amsscrapper.getlinks(current_url)
                    except Exception as e:
                        break
                    yield {"current_url": current_url, "status": "fetching"}
                    for link in allLinks:
                        authors = amsscrapper.get_authors(link)
                        for index, auth in enumerate(authors, start=1):
                            sht.save(auth)
                            if usegooglesheet:
                                shet.save(auth)
                            yield {"author": auth, "index": index}
                    if not filen:
                        break
                if usegooglesheet:
                    shet.save_to_file()
                sht.save_to_file()
                if stop_work:
                    break
            if owner:
                if usegooglesheet:
                    shet.transfer_ownership(mail)
            if usegooglesheet:
                yield {"final_output": sht.save_to_file(), "link": shet.save_to_file()}
            else:
                yield {"final_output": sht.save_to_file(), "link": ""}
        elif domain == "www.degruyter.com" or domain == "degruyter.com":
            for ur in links:
                isu = 1
                while True:
                    if stop_work:
                        break
                    try:
                        if filen:
                            current_url = ur.format(i=isu)
                            allLinks = degruyterscrapper.getLinks(current_url)
                            isu += 1
                        else:
                            current_url = ur
                            allLinks = degruyterscrapper.getLinks(current_url)
                    except Exception as e:
                        break
                    yield {"current_url": current_url, "status": "fetching"}
                    for link in allLinks:
                        authors = degruyterscrapper.get_author_details(link)
                        for index, auth in enumerate(authors, start=1):
                            sht.save(auth)
                            if usegooglesheet:
                                shet.save(auth)
                            yield {"author": auth, "index": index}
                    if not filen:
                        break
                if usegooglesheet:
                    shet.save_to_file()
                sht.save_to_file()
                if stop_work:
                    break
            if owner:
                if usegooglesheet:
                    shet.transfer_ownership(mail)
            if usegooglesheet:
                yield {"final_output": sht.save_to_file(), "link": shet.save_to_file()}
            else:
                yield {"final_output": sht.save_to_file(), "link": ""}
        elif domain == "www.aimspress.com" or domain == "aimspress.com":
            for ur in links:
                isu = 1
                while True:
                    if stop_work:
                        break
                    try:
                        if filen:
                            current_url = ur.format(i=isu)
                            allLinks = aiimsscrapper.get_links(current_url)
                            isu += 1
                        else:
                            current_url = ur
                            allLinks = aiimsscrapper.get_links(current_url)
                    except Exception as e:
                        break
                    yield {"current_url": current_url, "status": "fetching"}
                    for link in allLinks:
                        authors = aiimsscrapper.get_author_details(link)
                        for index, auth in enumerate(authors, start=1):
                            sht.save(auth)
                            if usegooglesheet:
                                shet.save(auth)
                            yield {"author": auth, "index": index}
                    if not filen:
                        break
                if usegooglesheet:
                    shet.save_to_file()
                sht.save_to_file()
                if stop_work:
                    break
            if owner:
                if usegooglesheet:
                    shet.transfer_ownership(mail)
            if usegooglesheet:
                yield {"final_output": sht.save_to_file(), "link": shet.save_to_file()}
            else:
                yield {"final_output": sht.save_to_file(), "link": ""}
        elif domain == "link.springer.com":
            for ur in links:
                isu = 1
                while True:
                    if stop_work:
                        break
                    try:
                        if filen:
                            current_url = ur.format(i=isu)
                            allLinks = springerscrapper.get_all_articals_link(current_url)
                            isu += 1
                        else:
                            current_url = ur
                            allLinks = springerscrapper.get_all_articals_link(current_url)
                    except Exception as e:
                        break
                    yield {"current_url": current_url, "status": "fetching"}
                    for link in allLinks:
                        authors = springerscrapper.get_authors(link)
                        for index, auth in enumerate(authors, start=1):
                            sht.save(auth)
                            if usegooglesheet:
                                shet.save(auth)
                            yield {"author": auth, "index": index}
                    if not filen:
                        break
                if usegooglesheet:
                    shet.save_to_file()
                sht.save_to_file()
                if stop_work:
                    break
            if owner:
                if usegooglesheet:
                    shet.transfer_ownership(mail)
            if usegooglesheet:
                yield {"final_output": sht.save_to_file(), "link": shet.save_to_file()}
            else:
                yield {"final_output": sht.save_to_file(), "link": ""}
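        # ScienceDirect gives no clean failure for out-of-range issue numbers
        # (presumably it serves a repeated page instead), so the two branches
        # below remember the previous page title and stop the issue loop as
        # soon as the same title comes back twice.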
        elif domain == "www.sciencedirect.com" and "acta-mathematica-scientia" in url:
            # This journal-specific check must come before the generic
            # www.sciencedirect.com branch, otherwise it can never match.
            oldtitle = ""
            for ur in links:
                isu = 1
                while True:
                    if stop_work:
                        break
                    try:
                        if filen:
                            current_url = ur.format(i=isu)
                            authors, title = sciencedirect_admaths.run(current_url)
                            if title == oldtitle:
                                break
                            else:
                                oldtitle = title
                            isu += 1
                        else:
                            current_url = ur
                            authors, title = sciencedirect_admaths.run(current_url)
                            if title == oldtitle:
                                break
                            else:
                                oldtitle = title
                    except Exception as e:
                        break
                    yield {"current_url": current_url, "status": "fetching"}
                    for index, auth in enumerate(authors, start=1):
                        sht.save(auth)
                        if usegooglesheet:
                            shet.save(auth)
                        yield {"author": auth, "index": index}
                    if not filen:
                        break
                if usegooglesheet:
                    shet.save_to_file()
                sht.save_to_file()
                if stop_work:
                    break
            if owner:
                if usegooglesheet:
                    shet.transfer_ownership(mail)
            if usegooglesheet:
                yield {"final_output": sht.save_to_file(), "link": shet.save_to_file()}
            else:
                yield {"final_output": sht.save_to_file(), "link": ""}
        elif domain == "www.sciencedirect.com":
            oldtitle = ""
            for ur in links:
                isu = 1
                while True:
                    if stop_work:
                        break
                    try:
                        if filen:
                            current_url = ur.format(i=isu)
                            authors, title = sciencedirect.run(current_url)
                            if title == oldtitle:
                                break
                            else:
                                oldtitle = title
                            isu += 1
                        else:
                            current_url = ur
                            authors, title = sciencedirect.run(current_url)
                            if title == oldtitle:
                                break
                            else:
                                oldtitle = title
                    except Exception as e:
                        break
                    yield {"current_url": current_url, "status": "fetching"}
                    for index, auth in enumerate(authors, start=1):
                        sht.save(auth)
                        if usegooglesheet:
                            shet.save(auth)
                        yield {"author": auth, "index": index}
                    if not filen:
                        break
                sht.save_to_file()
                if usegooglesheet:
                    shet.save_to_file()
                if stop_work:
                    break
            if owner:
                if usegooglesheet:
                    shet.transfer_ownership(mail)
            if usegooglesheet:
                yield {"final_output": sht.save_to_file(), "link": shet.save_to_file()}
            else:
                yield {"final_output": sht.save_to_file(), "link": ""}
        else:
            raise gr.Error("Invalid URL. Contact @H4CK3R_5M4CK3R on Telegram")
    except gr.Error:
        pass
    except Exception as e:
        raise gr.Error("An error occurred. Check your URL or contact @H4CK3R_5M4CK3R on Telegram")

def handle_url(Link: str, Gmail: str, FolderId: str, Output: str, MakeOwner: bool = True, UseGoogleSheet: bool = True, Reverse: bool = False):
    if len(FolderId) < 2:
        FolderId = os.getenv("FOLDER_ID")
    if len(Gmail) < 2:
        Gmail = os.getenv("GMAIL")
    authors = []
    details = []
    final_output = None
    multi_output = None
    link = None
    From_volume = 0
    To_Volume = 0
    urls = []
    if "\n" in Link:
        urls = Link.split("\n")
        multiUrls = True
    else:
        urls.append(Link)
        multiUrls = False
    print(f"URLS : {urls}")
    for url in urls:
        if url.startswith("("):
            # Parse the "(from:to)-" range prefix, then strip it from the URL.
            From_volume = int(url.split("(", 1)[1].split(":")[0])
            To_Volume = int(url.split("(", 1)[1].split(":")[1].split(")", 1)[0])
            url = url.split("-", 1)[1].strip()
        credit = None  # Stays None if the credentials cannot be decrypted.
        try:
            credit = crypto.decrypt(os.getenv("KEY"), os.getenv("GOOGLE_AUTH_CREDENTIALS"))
            credit = json.loads(credit)
        except Exception:
            pass
        for result in filterUrlandRun(url, From_volume, To_Volume, Reverse, Output, MakeOwner, Gmail, FolderId, credit=credit, usegooglesheet=UseGoogleSheet, multiUrls=multiUrls):
            if "final_output" in result:
                if not multiUrls:
                    final_output = result["final_output"]
                else:
                    multi_output = result["final_output"]
                link = result["link"]
            else:
                if "author" in result:
                    author = result["author"]
                    authors.append(f"Saving Author: {author.get('Name')}\n")
                if "current_url" in result:
                    current_url = result["current_url"]
                    details.append(f"Scraping: {current_url}\n")
                # Keep only the last three log lines of each kind.
                authors = authors[-3:] if len(authors) > 3 else authors
                details = details[-3:] if len(details) > 3 else details
            if multiUrls:
                final_output = None
            yield (
                "\n".join(authors),
                "\n".join(details),
                final_output if final_output else None,
                gr.Audio("notification.mp3", autoplay=True) if final_output else None,
                link,
            )
    if multiUrls:
        # With multiple URLs, the combined file is delivered once, at the very end.
        yield "\n".join(authors), "\n".join(details), multi_output, gr.Audio("notification.mp3", autoplay=True), link
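# The five domain branches in filterUrlandRun share the same loop shape and
# differ only in which module fetches links and authors. A hypothetical
# refactor (a sketch only — not wired in; `get_links` and `get_authors` stand
# for each module's own functions) could collapse them:
#
# def scrape_issue(get_links, get_authors, issue_url, sht, shet=None):
#     for article in get_links(issue_url):
#         for auth in get_authors(article):
#             sht.save(auth)
#             if shet is not None:
#                 shet.save(auth)
#             yield auth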
interface = gr.Interface(
    fn=handle_url,
    inputs=[
        gr.TextArea(label="URL / URLs", placeholder="Enter the URL (or multiple URLs, one per line) to scrape"),
        gr.Textbox(label="Access Gmail (Check Docs)"),
        gr.Textbox(label="Google Folder ID (Check Docs)"),
        gr.Textbox(label="Output File Name"),
        gr.Checkbox(True, label="Make Owner"),
        gr.Checkbox(True, label="Use Google Sheets"),
        "checkbox",  # maps to the Reverse parameter of handle_url
    ],
    outputs=[
        gr.Markdown("LOGS", height="250px", elem_id="authorscroll"),
        gr.Markdown("", height="100px", elem_id="authorscroll"),
        "file",
        "audio",
        "textbox",
    ],
    title="Web Scraper",
    description=description,
    examples=exmpl,
    cache_examples=False,
)

interface.launch(
    share=False,
    show_api=False,
    auth=auth,
)