H4CK3R-5M4CK3R committed
Commit 57273d8 · 1 Parent(s): 3f5414c
Scrpr
Browse files
- aiimsscrapper.py +64 -0
- amsscrapper.py +72 -0
- degruyterscrapper.py +61 -0
- docs.md +13 -0
- install.bat +2 -0
- notification.mp3 +0 -0
- out.xlsx +0 -0
- requirements.txt +7 -0
- run.bat +1 -0
- sciencedirect.py +210 -0
- sciencedirect_admaths.py +207 -0
- seleliumdriver.py +40 -0
- server.py +219 -0
- sheets.py +32 -0
- springerscrapper.py +103 -0
- wileyscrapper.py +92 -0
aiimsscrapper.py
ADDED
@@ -0,0 +1,64 @@
from seleliumdriver import WebScraper
from bs4 import BeautifulSoup
import time
import requests

def get_links(url: str):
    browser = WebScraper("firefox", hidden=True)
    browser.get(url)
    time.sleep(5)  # Important: give the page time to render before grabbing the HTML
    pagehtml = browser.get_html()
    browser.close_browser()
    fullPage = BeautifulSoup(pagehtml, "lxml")
    articals = fullPage.find("div", {"class": "j-archive-article"})
    output = []
    for link in articals.findAll("a", {"class" : "tit ng-binding ng-scope"}):
        output.append("https://www.aimspress.com" + link.get("href"))
    if len(output) < 1:
        raise ValueError("Invalid url found")
    return output

def save(dt):
    with open("data.html", "w") as op:
        op.write(str(dt))
    print("Done saved")

def get_author_details(url: str):
    browser = requests.session()
    data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")
    authors = fullPage.find("ul", {"class" : "article-author clear"})
    output = []
    author_about = fullPage.find("ul", {"class" : "about-author"})
    authors_about = [d.text.strip() for d in author_about.findAll("div", {"class" : "lostOf"})]
    for author in authors.findAll("li"):
        author_name = author.find("a", {"type" : "authors.authorNameEn"}).text.strip()
        mail = author.find("a", {"class" : "com-mail"})
        if mail:
            mail = mail.get("href").split(":", 1)[1].strip()
        else:
            continue
        try:
            author_value_tag = author.find("a", {"class" : "com-num"}).get("data-tagval")
            if "," in author_value_tag:
                author_value_tag = [int(da) - 1 for da in author_value_tag.split(",")]
            else:
                author_value_tag = [int(author_value_tag) - 1]
            address = None
            for a in author_value_tag:
                if address:
                    address = f"{address} & {authors_about[a]}"
                else:
                    address = authors_about[a]
        except:
            author_name = author.find("a", {"type" : "authors.authorNameEn"}).text.strip()
            mail = author.find("a", {"class" : "com-mail"}).get("title").split(":", 1)[1].strip()
            address = author.find("a", {"class" : "com-user"}).get("title").split(":", 1)[1].strip()
        output.append(
            {
                "Name" : author_name,
                "Email" : mail,
                "Address" : address
            }
        )
    return output
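A minimal usage sketch for this module, assuming a valid AIMS Press issue URL (the URL below is only a placeholder, not taken from the repo): collect the article links for an issue, then pull the author name, e-mail, and affiliation from each article page.

# Hypothetical driver for aiimsscrapper; the issue URL is a placeholder.
import aiimsscrapper

issue_url = "https://www.aimspress.com/journal/era/2024/1"  # placeholder issue URL
for article_url in aiimsscrapper.get_links(issue_url):
    for author in aiimsscrapper.get_author_details(article_url):
        # Each entry is a dict with "Name", "Email" and "Address" keys
        print(author["Name"], author["Email"], author["Address"])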
amsscrapper.py
ADDED
@@ -0,0 +1,72 @@
import requests
from bs4 import BeautifulSoup
from sheets import ExcelAutomator

def get_headers(data: str) -> dict:
    """This function converts headers from a string into a dict.

    Args:
        data (str): The headers as a string (in Firefox you can copy the request or response headers and this will convert them into a dict)

    Returns:
        dict: The headers as a dict
    """
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()

        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False

        out[key] = value
    return out

def getlinks(url: str) -> list:
    browser = requests.session()
    # url = f"https://www.ams.org/journals/jams/{year}-{volume}-{issue}/home.html?active=allissues"
    data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")
    article = fullPage.find("article", {"class" : "contentList"})
    output = []
    lnk = url.split('home.html', 1)[0]
    for allarticle in article.findAll("dl"):
        output.append(f'{lnk}{allarticle.find("dt").find("a").get("href")}')
    return output

def get_authors(url: str):
    browser = requests.session()
    data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")
    details = fullPage.find("section", {"id" : "additionalinformation"})
    email = None
    address = None
    author_name = None
    output = []
    for author in details.findAll("li"):
        if email != None and author_name != None and address != None:
            output.append(
                {
                    "Name" : author_name,
                    "Email" : email,
                    "Address" : address
                }
            )
            email = None
            author_name = None
            address = None
        if author.find("strong"):
            author_name = author.text
        elif "Email:" in author.text:
            email = author.text.split(":", 1)[1].strip()
        elif "Affiliation:" in author.text:
            address = author.text.split(":", 1)[1].strip()
        if author_name == None:
            continue
    return output
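As a quick illustration of the `get_headers` helper that several of these modules repeat, here is a small self-contained sketch; the header block is an arbitrary example, not one copied from the repo.

from amsscrapper import get_headers

raw = """
User-Agent: Mozilla/5.0
Accept-Language: en-US,en;q=0.5
"""
print(get_headers(raw))
# {'User-Agent': 'Mozilla/5.0', 'Accept-Language': 'en-US,en;q=0.5'}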
degruyterscrapper.py
ADDED
@@ -0,0 +1,61 @@
import requests
from bs4 import BeautifulSoup

def get_headers(data: str) -> dict:
    """This function converts headers from a string into a dict.

    Args:
        data (str): The headers as a string (in Firefox you can copy the request or response headers and this will convert them into a dict)

    Returns:
        dict: The headers as a dict
    """
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()

        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False

        out[key] = value
    return out

def getLinks(url: str) -> list:
    browser = requests.session()
    # url = f"https://www.degruyter.com/journal/key/fca/{volume}/{issue}/html"
    data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")
    links = fullPage.find("div", {"id" : "issue-subject-group-researchpaper"})
    output = []
    for link in links.findAll("div", {"class" : "text-container"}):
        link = link.find("a", {"class" : "issueContentsArticleLink linkHoverDark d-inline-block"}).get("href")
        output.append(f"https://www.degruyter.com{link}")
    return output

def get_author_details(url: str) -> list:
    browser = requests.session()
    data = browser.get(url)
    authors = BeautifulSoup(data.text, "lxml")
    authors = authors.find("ul", {"class" : "contributors list-unstyled mb-2"})
    output = []
    for author in authors.findAll("span", {"class" : "contributor"}):
        author_name = author.text.strip()
        author_address = author.find("contributor-popdown").get("affiliations").strip()
        email = author.find("contributor-popdown").get("email").strip()
        if len(email.strip()) < 1:
            continue
        output.append(
            {
                "Name" : author_name,
                "Email" : email,
                "Address" : author_address
            }
        )
    return output
docs.md
ADDED
@@ -0,0 +1,13 @@
## Steps to install

- Make sure that Python and pip are installed. You can run the `python` command to check whether Python is available.
- Check that pip works by running the `pip` command. If it does not, use `python -m pip install -r requirements.txt`; if the `python` command is not available either, use `python3 -m pip install -r requirements.txt`.
- If only `python3` works, edit `run.bat` and change `python` to `python3`.
- If the `python` command works fine, just double-click `install.bat`.
- Then run `pip install webdriver-manager`. If that command does not work, use `python -m pip install webdriver-manager` (or `python3` instead of `python`).
- Now everything is up and running.

## Steps to use

- To use the script, just double-click the `run.bat` file.
- Then open `http://127.0.0.1:7860` in your browser and you are good to go.
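Not covered above: `server.py` accepts either a plain issue URL or a templated one with `{v}` and `{i}` placeholders for volume and issue. A minimal sketch of how `filterUrlandRun` expands such a template (the De Gruyter URL is only an illustrative placeholder adapted from the comment in `degruyterscrapper.py`):

# Sketch of the URL templating used by server.py; the journal URL is a placeholder.
url = "https://www.degruyter.com/journal/key/fca/{v}/{i}/html"
for vol in range(26, 28):                      # "From volume" .. "To volume"
    per_volume = url.format(v=vol, i="{i}")    # volume filled in, issue left as {i}
    first_issue = per_volume.format(i=1)       # issue filled in later, per iteration
    print(first_issue)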
install.bat
ADDED
@@ -0,0 +1,2 @@
python -m pip install -r requirements.txt
echo Installation completed successfully
notification.mp3
ADDED
Binary file (931 kB)
out.xlsx
ADDED
Binary file (105 kB)
requirements.txt
ADDED
@@ -0,0 +1,7 @@
bs4
pygame
gradio
openpyxl
selenium
requests
webdriver-manager
run.bat
ADDED
@@ -0,0 +1 @@
python server.py
sciencedirect.py
ADDED
@@ -0,0 +1,210 @@
import os
from requests import session
from bs4 import BeautifulSoup
import base64
import urllib.parse
import traceback
import json
from sheets import ExcelAutomator

req = session()

def get_headers(data: str) -> dict:
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()

        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False

        out[key] = value
    return out

def get_email_from_encoding(encoded_str):
    base64_decoded = base64.b64decode(encoded_str).decode('utf-8')
    url_decoded = urllib.parse.unquote(base64_decoded)
    decoded_json = json.loads(url_decoded)
    try:
        if decoded_json["#name"] == 'e-address':
            if decoded_json['$']['type'] == 'email':
                if 'href' in decoded_json['$']:
                    if 'mailto:' in decoded_json['$']['href']:
                        return decoded_json['$']['href'].replace("mailto:", "")
                    else:
                        return None
                else:
                    return decoded_json['_']
            else:
                return None
        else:
            return None
    except Exception as e:
        with open("jsondata.json", "w") as op:
            json.dump(decoded_json, op)
        print(f"ERROR : {e},\n---------------------------------------------------------\n{traceback.format_exc()}\n\n---------------------------------------------------------")
        exit()

def run(url: str, last_artical_name: str=None) -> tuple:
    """This function collects author details from an issue listing page

    Args:
        url (str): The volume/issue listing URL to scrape
        last_artical_name (str, optional): Title of the previously scraped page, used to stop when a page repeats

    Returns:
        tuple : It includes auth data and page title
    """
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """

    headers = get_headers(headers)

    # url = f"https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{volume}/issue/{issue}"

    data = req.get(url, headers=headers)

    artical_links = []
    fullpage = BeautifulSoup(str(data.text), "lxml")
    if fullpage.title.string.strip() == last_artical_name:
        return None, fullpage.title.string.strip()
    for link in fullpage.findAll("a", {"class" : "anchor article-content-title u-margin-xs-top u-margin-s-bottom anchor-default"}):
        artical_links.append("https://www.sciencedirect.com" + link.get("href"))
    print(f"Total articles found : {len(artical_links)}")
    n = 1
    auth = []
    print(f"Getting all articles from - {fullpage.title.string}")
    for li in artical_links:
        print(f"Fetching data for article {n}")
        authors = stage_two(li)
        auth.extend(authors)
        n += 1
    return auth, fullpage.title.string.strip()

def stage_two(url: str) -> list:

    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)

    data = req.get(url, headers=headers)
    page = BeautifulSoup(data.text, "lxml")
    json_data = page.find("script", {"type" : "application/json"})
    json_data = json.loads(json_data.text.strip())
    authors_detail = []
    address = json_data['authors']['affiliations']
    n = 1
    if len(json_data['authors']['content']) < 1:
        return authors_detail
    if '$$' not in json_data['authors']['content'][0]:
        with open("jsondata.json", "w") as op:
            json.dump(json_data, op, indent=4)
        print("ERROR Check jsondata file")
        exit()
    address = "Not Found"
    addr = []
    authr = []
    email = None
    for author in json_data['authors']['content'][0]['$$']:
        if author['#name'] == 'author':
            # Author data
            author_name = " "
            for au in author['$$']:
                if au['#name'] == 'given-name' or au['#name'] == 'name':
                    author_name = au['_'] + author_name
                if au['#name'] == 'surname':
                    author_name = f"{author_name}{au['_']}"
                if au['#name'] == 'encoded-e-address':
                    email = get_email_from_encoding(au['__encoded'])
                    if email:
                        authr.append(
                            {
                                'Name' : author_name,
                                'Email' : email
                            }
                        )
                    else:
                        continue
        if author['#name'] == 'affiliation':
            for cor in author['$$']:
                if '_' in cor:
                    if address == "Not Found":
                        address = cor['_']
                    else:
                        address = f"{address} {cor['_']}"
            addr.append(address)

    output = []
    for aut in authr:
        try:
            address = addr[authr.index(aut)]
        except:
            address = "Not Found"
        if address == "Not Found":
            address = url
        output.append(
            {
                'Name' : aut['Name'],
                'Email' : aut['Email'],
                'Address' : address
            }
        )
    return output

def get_author_info_specific(vol: int, issue: int) -> list:
    print(f"Getting detail of volume {vol} and issue {issue}")
    data, page_title = run(vol, issue)
    return data

def get_author_info_in_range(from_vol: int, to_vol: int) -> list:
    allAuthors = []
    last_page_title = None
    for i in range(from_vol, to_vol + 1):
        print(f"Getting data of vol {i}")
        d = 1
        while True:
            try:
                data, page_title = run(i, d, last_page_title)
                if last_page_title == page_title:
                    print(f"All issues covered of vol {i}, changing volume")
                    print("--------------------------------------------------------------------------")
                    break
                else:
                    last_page_title = page_title
                allAuthors.extend(data)
                print(f"Issue {d} data received, total authors : {len(allAuthors)}")
            except Exception as e:
                print(f"ERROR : {traceback.format_exc()}")
                print(f"All issues covered of vol {i}")
                print("--------------------------------------------------------------------------")
                break
            d += 1
    return allAuthors
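To see what `get_email_from_encoding` expects, here is a small round-trip sketch with a synthetic payload: the JSON structure mimics the `encoded-e-address` entries this module reads from ScienceDirect's embedded JSON, and the e-mail address is made up.

import base64, json, urllib.parse
from sciencedirect import get_email_from_encoding

# Build a synthetic encoded e-address entry: JSON -> URL-encode -> base64
entry = {"#name": "e-address", "$": {"type": "email", "href": "mailto:jane.doe@example.org"}}
encoded = base64.b64encode(urllib.parse.quote(json.dumps(entry)).encode()).decode()

print(get_email_from_encoding(encoded))  # -> jane.doe@example.org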
sciencedirect_admaths.py
ADDED
@@ -0,0 +1,207 @@
import os
from requests import session
from bs4 import BeautifulSoup
import base64
import urllib.parse
import traceback
import json
from sheets import ExcelAutomator

req = session()

def get_headers(data: str) -> dict:
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()

        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False

        out[key] = value
    return out

def get_email_from_encoding(encoded_str):
    try:
        base64_decoded = base64.b64decode(encoded_str).decode('utf-8')
        url_decoded = urllib.parse.unquote(base64_decoded)
        decoded_json = json.loads(url_decoded)
        try:
            if decoded_json["#name"] == 'e-address':
                if decoded_json['$']['type'] == 'email':
                    if 'href' in decoded_json['$']:
                        if 'mailto:' in decoded_json['$']['href']:
                            return decoded_json['$']['href'].replace("mailto:", "")
                        else:
                            return None
                    else:
                        return decoded_json['_']
                else:
                    return None
            else:
                return None
        except Exception as e:
            with open("jsondata.json", "w") as op:
                json.dump(decoded_json, op)
            print(f"ERROR : {e},\n---------------------------------------------------------\n{traceback.format_exc()}\n\n---------------------------------------------------------")
            exit()
    except:
        return None

def run(url: str, last_artical_name: str=None) -> tuple:
    """This function collects author details from a volume listing page

    Args:
        url (str): The volume listing URL to scrape
        last_artical_name (str, optional): Title of the previously scraped page, used to stop when a page repeats

    Returns:
        tuple : It includes auth data and page title
    """
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """

    headers = get_headers(headers)

    # url = f"https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/{volume}/suppl/C"

    data = req.get(url, headers=headers)

    artical_links = []
    fullpage = BeautifulSoup(str(data.text), "lxml")
    if fullpage.title.string.strip() == last_artical_name:
        return None, fullpage.title.string.strip()
    for link in fullpage.findAll("a", {"class" : "anchor article-content-title u-margin-xs-top u-margin-s-bottom anchor-default"}):
        artical_links.append("https://www.sciencedirect.com" + link.get("href"))
    print(f"Total articles found : {len(artical_links)}")
    n = 1
    auth = []
    print(f"Getting all articles from - {fullpage.title.string}")
    for li in artical_links:
        print(f"Fetching data for article {n}")
        authors = stage_two(li)
        auth.extend(authors)
        n += 1
    return auth, fullpage.title.string.strip()

def stage_two(url: str) -> list:

    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)

    data = req.get(url, headers=headers)
    page = BeautifulSoup(data.text, "lxml")
    json_data = page.find("script", {"type" : "application/json"})
    json_data = json.loads(json_data.text.strip())
    authors_detail = []
    address = json_data['authors']['affiliations']
    n = 1
    if len(json_data['authors']['content']) < 1:
        return authors_detail
    if '$$' not in json_data['authors']['content'][0]:
        with open("jsondata.json", "w") as op:
            json.dump(json_data, op, indent=4)
        print("ERROR Check jsondata file")
        exit()
    address = "Not Found"
    addr = []
    authr = []
    for author in json_data['authors']['content'][0]['$$']:
        if author['#name'] == 'author':
            # Author data
            author_name = " "
            for au in author['$$']:
                if au['#name'] == 'given-name' or au['#name'] == 'name':
                    author_name = au['_'] + author_name
                if au['#name'] == 'surname':
                    author_name = f"{author_name}{au['_']}"
                if au['#name'] == 'encoded-e-address':
                    email = get_email_from_encoding(au['__encoded'])
                    if email:
                        authr.append(
                            {
                                'Name' : author_name,
                                'Email' : email
                            }
                        )
        if author['#name'] == 'affiliation':
            for cor in author['$$']:
                if '_' in cor:
                    if address == "Not Found":
                        address = cor['_']
                    else:
                        address = f"{address} {cor['_']}"
            addr.append(address)

    output = []
    for aut in authr:
        try:
            address = addr[authr.index(aut)]
        except:
            address = "Not Found"
        if address == "Not Found":
            address = url
        output.append(
            {
                'Name' : aut['Name'],
                'Email' : aut['Email'],
                'Address' : address
            }
        )
    return output

def get_author_info_specific(vol: int) -> list:
    print(f"Getting detail of volume {vol}")
    data, page_title = run(vol)
    return data

def get_author_info_in_range(from_vol: int, to_vol: int) -> list:
    allAuthors = []
    last_page_title = None
    for i in range(from_vol, to_vol + 1):
        print(f"Getting data of vol {i}")
        try:
            data, page_title = run(i, last_page_title)
            if last_page_title == page_title:
                print(f"All issues covered of vol {i}, changing volume")
                print("--------------------------------------------------------------------------")
                break
            else:
                last_page_title = page_title
            allAuthors.extend(data)
            print(f"Data received, total authors : {len(allAuthors)}")
        except Exception as e:
            print(f"ERROR : {traceback.format_exc()}")
            print(f"All issues covered of vol {i}")
            print("--------------------------------------------------------------------------")
            break
    return allAuthors
seleliumdriver.py
ADDED
@@ -0,0 +1,40 @@
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.firefox.options import Options as FirefoxOptions

class WebScraper:

    def __init__(self, browser='chrome', hidden=True):
        if browser.lower() == 'chrome':
            options = ChromeOptions()
            if hidden:
                options.add_argument('--headless')
                options.add_argument('--window-size=1920,1200')
            self.driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
        elif browser.lower() == 'firefox':
            options = FirefoxOptions()
            if hidden:
                options.add_argument('--headless')
                options.add_argument('--window-size=1920,1200')
            self.driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()), options=options)
        else:
            raise ValueError('Unsupported browser. Only "chrome" and "firefox" are supported.')

    def get(self, url, wait_time=10):
        self.driver.get(url)
        WebDriverWait(self.driver, wait_time).until(
            EC.presence_of_element_located((By.TAG_NAME, 'body'))
        )

    def get_html(self):
        return self.driver.page_source

    def close_browser(self):
        self.driver.quit()
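A short usage sketch for `WebScraper` (the URL is a placeholder): open a headless Firefox via webdriver-manager, grab the rendered HTML, and close the browser.

from seleliumdriver import WebScraper

scraper = WebScraper("firefox", hidden=True)   # downloads geckodriver on first run
scraper.get("https://example.com")             # placeholder URL; waits for <body> to load
html = scraper.get_html()                      # rendered page source
scraper.close_browser()
print(len(html))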
server.py
ADDED
@@ -0,0 +1,219 @@
import gradio as gr
import aiimsscrapper
import amsscrapper
import degruyterscrapper
import sciencedirect
import sciencedirect_admaths
import springerscrapper
import wileyscrapper
from urllib.parse import urlparse
import traceback
from sheets import ExcelAutomator
import pygame
import threading
from datetime import datetime
import os

def play_sound():
    pygame.mixer.init()
    pygame.mixer.music.load("notification.mp3") # Ensure this file exists
    pygame.mixer.music.play()

def print(data: str):
    if not os.path.exists("LOGS.txt"):
        with open("LOGS.txt", "w") as op:
            op.write(f"{datetime.now().hour}-{datetime.now().minute}-{datetime.now().second}.{datetime.now().microsecond}/{datetime.now().day}-{datetime.now().month}-{datetime.now().year}\n------------------------------------------------------------------\n")
    with open("LOGS.txt", "a") as op:
        op.write(f"\n{datetime.now().hour}-{datetime.now().minute}-{datetime.now().second}.{datetime.now().microsecond}/{datetime.now().day}-{datetime.now().month}-{datetime.now().year} -> {data}")
    gr.Info(data, duration=3)

def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, output: str) -> str:
    url_sch = urlparse(url)
    domain = url_sch.hostname
    sht = ExcelAutomator([
        "Name",
        "Email",
        "Address"
    ],
        output
    )
    filen = True
    if "{" in url:
        links = []
        if reverse:
            for vol in reversed(range(from_range, to_range)):
                print(url)
                links.append(url.format(v=vol, i="{i}"))
        else:
            for vol in range(from_range, to_range):
                print(url)
                links.append(url.format(v=vol, i="{i}"))
    else:
        links = [url]
        filen = False
    print(f"Total links found {len(links)}")
    try:
        if domain == "www.ams.org" or domain == "ams.org":
            # AMS scraper
            for ur in links:
                isu = 1
                while True:
                    if len(str(isu)) < 2:
                        isu = f"0{isu}"
                    try:
                        if filen:
                            print(f"Getting data for link {ur.format(i=isu)}")

                            allLinks = amsscrapper.getlinks(ur.format(i=isu))
                            isu += 1
                        else:
                            print(f"Getting data for link {ur}")
                            allLinks = amsscrapper.getlinks(ur)
                    except:
                        print("Error")
                        break
                    for link in allLinks:
                        authors = amsscrapper.get_authors(link)
                        for auth in authors:
                            sht.save(auth)
                    if filen == False: # Only a single link, so don't keep looping
                        break
            sht.save_to_file()
            return sht.save_to_file()
        elif domain == "www.degruyter.com" or domain == "degruyter.com":
            # De Gruyter scraper
            for ur in links:
                isu = 1
                while True:
                    try:
                        if filen:
                            print(f"Getting data for link {ur.format(i=isu)}")
                            allLinks = degruyterscrapper.getLinks(ur.format(i=isu))
                            isu += 1
                        else:
                            print(f"Getting data for link {ur}")
                            allLinks = degruyterscrapper.getLinks(ur)
                    except:
                        break
                    for link in allLinks:
                        authors = degruyterscrapper.get_author_details(link)
                        for auth in authors:
                            sht.save(auth)
                    if filen == False: # Only a single link, so don't keep looping
                        break
            sht.save_to_file()
            return sht.save_to_file()
        elif domain == "www.aimspress.com" or domain == "aimspress.com":
            for ur in links:
                isu = 1
                while True:
                    try:
                        if filen:
                            print(f"Getting data for link {ur.format(i=isu)}")
                            allLinks = aiimsscrapper.get_links(ur.format(i=isu))
                            isu += 1
                        else:
                            print(f"Getting data for link {ur}")
                            allLinks = aiimsscrapper.get_links(ur)
                    except:
                        break
                    for link in allLinks:
                        authors = aiimsscrapper.get_author_details(link)
                        for auth in authors:
                            sht.save(auth)
                    if filen == False: # Only a single link, so don't keep looping
                        break
            sht.save_to_file()
            return sht.save_to_file()
        elif domain == "link.springer.com":
            # Springer scraping here
            for ur in links:
                isu = 1
                while True:
                    try:
                        if filen:
                            print(f"Getting data for link {ur.format(i=isu)}")
                            allLinks = springerscrapper.get_all_articals_link(ur.format(i=isu))
                            isu += 1
                        else:
                            print(f"Getting data for link {ur}")
                            allLinks = springerscrapper.get_all_articals_link(ur)
                    except:
                        break
                    for link in allLinks:
                        authors = springerscrapper.get_authors(link)
                        for auth in authors:
                            sht.save(auth)
                    if filen == False: # Only a single link, so don't keep looping
                        break
            sht.save_to_file()
            return sht.save_to_file()
        elif domain == "www.sciencedirect.com" and "acta-mathematica-scientia" not in url:
            # Regular ScienceDirect scraping here
            for ur in links:
                isu = 1
                while True:
                    try:
                        if filen:
                            print(f"Getting data for link {ur.format(i=isu)}")
                            allLinks = sciencedirect.run(ur.format(i=isu))
                            isu += 1
                        else:
                            print(f"Getting data for link {ur}")
                            allLinks = sciencedirect.run(ur)
                    except:
                        break
                    for link in allLinks:
                        authors = sciencedirect.stage_two(link)
                        for auth in authors:
                            sht.save(auth)
                    if filen == False: # Only a single link, so don't keep looping
                        break
            sht.save_to_file()
            return sht.save_to_file()
        elif domain == "www.sciencedirect.com" and "acta-mathematica-scientia" in url:
            # Acta Mathematica Scientia data here
            for ur in links:
                isu = 1
                while True:
                    try:
                        if filen:
                            print(f"Getting data for link {ur.format(i=isu)}")
                            allLinks = sciencedirect_admaths.run(ur.format(i=isu))
                            isu += 1
                        else:
                            print(f"Getting data for link {ur}")
                            allLinks = sciencedirect_admaths.run(ur)
                    except:
                        break
                    for link in allLinks:
                        authors = sciencedirect_admaths.stage_two(link)
                        for auth in authors:
                            sht.save(auth)
                    if filen == False: # Only a single link, so don't keep looping
                        break
            sht.save_to_file()
            return sht.save_to_file()
        else:
            raise gr.Error("Invalid URL found, contact @H4CK3R_5M4CK3R on telegram")
    except gr.Error:
        pass
    except:
        with open("ERROR-LOGS.txt", "w") as op:
            op.write(f"Error {url} : {traceback.format_exc()}")
        raise gr.Error("Some error has occurred, check your URL or contact @h4ck3r_5m4ck3r on telegram")

def handle_url(url, From_volume: int, To_Volume: int, Output: str, Reverse: bool):
    output = filterUrlandRun(url, From_volume, To_Volume, Reverse, Output)
    threading.Thread(target=play_sound).start()
    return output

interface = gr.Interface(
    fn=handle_url,
    inputs=["textbox", "number", "number", "textbox","checkbox"],
    outputs="file",
    title="Web Scrapper",
    description="Enter a URL and download a generated XLSX file."
)

interface.launch()
sheets.py
ADDED
@@ -0,0 +1,32 @@
import openpyxl
import os

class ExcelAutomator:

    def __init__(self, name: list, output: str):
        self.columns = name
        self.output = output if output.endswith(".xlsx") else f"{output}.xlsx"

        if os.path.exists(self.output):
            self.workbook = openpyxl.load_workbook(self.output)
            self.sheet = self.workbook.active
        else:
            self.workbook = openpyxl.Workbook()
            self.sheet = self.workbook.active
            for col_num, column_name in enumerate(self.columns, 1):
                self.sheet.cell(row=1, column=col_num, value=column_name)

    def save(self, data_dict):
        """
        Save a new row of data to the Excel file.
        :param data_dict: Dictionary with keys as column names and values as the data to save.
        """
        row_data = [data_dict.get(column, None) for column in self.columns]
        self.sheet.append(row_data)

    def save_to_file(self):
        """
        Save the workbook to a file.
        """
        self.workbook.save(self.output)
        return self.output
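A minimal sketch of how the scrapers use `ExcelAutomator`: create it with the column names, append one dict per author, and write the workbook out. The row below is made-up sample data.

from sheets import ExcelAutomator

sheet = ExcelAutomator(["Name", "Email", "Address"], "out")   # writes out.xlsx
sheet.save({"Name": "Jane Doe", "Email": "jane.doe@example.org", "Address": "Example University"})
path = sheet.save_to_file()
print(f"Saved to {path}")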
springerscrapper.py
ADDED
@@ -0,0 +1,103 @@
import requests
from bs4 import BeautifulSoup
import json
from sheets import ExcelAutomator

def get_headers(data: str) -> dict:
    """This function converts headers from a string into a dict.

    Args:
        data (str): The headers as a string (in Firefox you can copy the request or response headers and this will convert them into a dict)

    Returns:
        dict: The headers as a dict
    """
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()

        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False

        out[key] = value
    return out

def get_all_articals_link(url: str) -> dict:
    browser = requests.session()
    # url = f"https://link.springer.com/journal/208/volumes-and-issues/{volume}-{issue}"
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Referer: https://link.springer.com/journal/208/volumes-and-issues
    Alt-Used: link.springer.com
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: same-origin
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    TE: trailers
    """

    head = get_headers(headers)

    data = browser.get(url, headers=head)

    fullpage = BeautifulSoup(data.text, "lxml")

    orderlist = fullpage.find("ol", {"class" : "u-list-reset"})
    allLinks = []
    for dt in orderlist.findAll("li"):
        if not dt.find("a"):
            continue
        allLinks.append(dt.find("a").get("href"))
    return allLinks

def get_authors(url: str) -> list:
    browser = requests.session()
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Referer: https://link.springer.com/journal/208/volumes-and-issues
    Alt-Used: link.springer.com
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: same-origin
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    TE: trailers
    """

    head = get_headers(headers)
    data = browser.get(url, headers=head)

    main_page = BeautifulSoup(data.text, "lxml")

    json_data = main_page.find("script", {"type" : "application/ld+json"}).text
    json_data = json.loads(json_data)
    authors = json_data['mainEntity']['author']
    output = []
    for author in authors:
        if 'email' in author:
            output.append(
                {
                    "Name" : author['name'],
                    'Email' : author['email'],
                    'Address' : ", ".join(item['address']['name'] for item in author['affiliation'] if 'address' in item and 'name' in item['address'])
                }
            )
    return output
wileyscrapper.py
ADDED
@@ -0,0 +1,92 @@
import requests
from bs4 import BeautifulSoup
from sheets import ExcelAutomator
from seleliumdriver import WebScraper

browser = requests.session()

def save(data:str):
    with open("data.html", "w") as op:
        op.write(str(data))

def get_headers(data: str) -> dict:
    """This function converts headers from a string into a dict.

    Args:
        data (str): The headers as a string (in Firefox you can copy the request or response headers and this will convert them into a dict)

    Returns:
        dict: The headers as a dict
    """
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()

        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False

        out[key] = value
    return out

def get_links(url: str, issue: int) -> list:
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Alt-Used: onlinelibrary.wiley.com
    Connection: keep-alive
    Cookie: MAID=5SmfjugKsbanANmqY7QFTQ==; MACHINE_LAST_SEEN=2024-07-13T00%3A17%3A54.434-07%3A00; osano_consentmanager_uuid=a09cf48c-a316-44da-a630-b284fe618561; osano_consentmanager=wtFVO73sxrqPK1QjgWvz2PRznZ_IuLc6ARcv2t0_pFtepafXHZgrg-S478uJo9AvbIWsu3sbpgmvaCKL_zNkJQZzpvdHNzGX6NQQ6cwL_c09p-7H9gmYq7lFeOBlGJxYVbwgVIa5TZDqtpLjvla4iYf-rEyPZ0zXi8nZVVY5aCRrKBkWiIYkwIWvpeVeBLepXirD0RkYCGg-O2PWE000CQi4kWVXGTOkNMFqFOSQ-tthQqpC7pvT9AeCAodC2z6CeM6tTjz3TNmp8sTxikwwT4jzZ9HRy76gqQjb8g==; kndctr_1B6E34B85282A0AC0A490D44_AdobeOrg_identity=CiY4MDg5NTE5MTAxMTg2NDkzMzQzMTI2OTY5MjMzMTU3OTYwODc1N1ITCM6izY3mMRABGAEqBElORDEwAPAB5cnS14oy; Hm_lvt_953dddc9c7bea72022e3bd3ba7782e7b=1720765103,1720855080; AMCV_1B6E34B85282A0AC0A490D44%40AdobeOrg=MCMID|80895191011864933431269692331579608757; JSESSIONID=90BFBDCF8874DBB2B708D37ACC4172DD; __cf_bm=FgCtBcokrG75eoj6.nqj2jTcbcl.vtSPGArq4iAYwYk-1720855074-1.0.1.1-OCKWcrDvKtyaaNLld1aBjaFFwZLoLHauSzJ0NEZFn1JLYK4G4lqmaTMEE50PAzZCReTc13aRgLNyLlqu6JOllleWjBRMQr5vc3YjxJ4kdPs; kndctr_1B6E34B85282A0AC0A490D44_AdobeOrg_cluster=ind1; cf_clearance=B0r0CEgCWVP2M5CKvRhRTvIW8MyIJM2WBVS14NsHxxE-1720855079-1.0.1.1-CqrZHd19zoe3QCemtBtqxsHiVLXILmnPkb9RjSG2yHndhy.XZzt14jGpjymiEPzjA0nFP7xw1hU6xsXIz6UDSg; Hm_lpvt_953dddc9c7bea72022e3bd3ba7782e7b=1720855160; HMACCOUNT=C851A9F6625CC221; randomizeUser=0.5543043437474287
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    TE: trailers
    """
    # url = f"https://onlinelibrary.wiley.com/toc/14679590/{year}/{volume}/{issue}"
    data = browser.get(url, headers=get_headers(headers))
    fullPage = BeautifulSoup(data.text, "lxml")
    issuelinks = []
    for link in fullPage.findAll("a", {"class" : "issue-item__title visitable"}):
        issuelinks.append(f'https://onlinelibrary.wiley.com{link.get("href")}')
    return issuelinks

def decode_email(encoded_str):
    key = int(encoded_str[:2], 16)
    encoded_bytes = bytes.fromhex(encoded_str[2:])
    decoded_email = ''.join(chr(byte ^ key) for byte in encoded_bytes)
    return decoded_email

def get_details(url: str):
    driver = WebScraper(browser="firefox", hidden=False)
    driver.get(url)
    data = driver.get_html()
    # save(data.text)
    full_page = BeautifulSoup(data, "lxml")
    author_detail = full_page.find("div", {"class" : "accordion-tabbed"})
    output = []
    save(full_page)
    for author in author_detail.findAll("span", {"class" : "accordion-tabbed__tab-mobile accordion__closed"}):
        author_name = author.find("p", {"class" : "author-name"}).text.strip()
        if author.find("span", {"class" : "__cf_email__"}) == None:
            continue
        email = decode_email(author.find("span", {"class" : "__cf_email__"}).get("data-cfemail"))
        address = author.find("p", {"class" : "author-name"}).findNext("p").text.strip()
        output.append(
            {
                "Name" : author_name,
                "Email" : email,
                'Address' : address
            }
        )

    return output
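The Cloudflare `data-cfemail` scheme that `decode_email` reverses is a simple XOR: the first hex byte is the key and every following byte is a character XORed with it. A self-contained round-trip check with a made-up address and key (the encoder below is only an illustrative inverse, not part of the repo):

from wileyscrapper import decode_email

def encode_email(email: str, key: int = 0x42) -> str:
    # Inverse of decode_email: prepend the key byte, then XOR each character with it
    return f"{key:02x}" + bytes(ord(c) ^ key for c in email).hex()

encoded = encode_email("a@b.c")   # -> "422302206c21"
print(decode_email(encoded))      # -> "a@b.c"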