Spaces:

pryanshusharma
/

PrmScrp

Running

App Files Files Community

H4CK3R-5M4CK3R committed on Jul 28, 2024

Commit

c83cceb

1 Parent(s): ccab18e

added docs

Browse files

Files changed (2) hide show

app.py +60 -3
wileyscrapper.py +5 -22

app.py CHANGED Viewed

@@ -17,6 +17,38 @@ from builtins import print as original_print
 import random
 import string
 def generate_random_filename(length=8):
     """Return a random name made of ASCII letters and digits.

     Args:
         length: Number of characters to generate (default 8).

     Returns:
         A string of ``length`` characters drawn, with replacement, from
         ``string.ascii_letters + string.digits``.
     """
     alphabet = string.ascii_letters + string.digits
     return ''.join(random.choices(alphabet, k=length))
@@ -208,6 +240,30 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
                         break
                     sht.save_to_file()
             return sht.save_to_file()
         else:
             raise gr.Error("Invalid url found contact : @H4CK3R_5M4CK3R on telegram")
     except gr.Error:
@@ -216,8 +272,8 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
         print(f"ERROR : {traceback.format_exc()}", show=False)
         raise gr.Error("Something error has occur check your url or contact @h4ck3r_5m4ck3r on telegram")
-def handle_url(url, From_volume: int, To_Volume: int, Output: str, Reverse: bool):
-    output = filterUrlandRun(url, From_volume, To_Volume, Reverse, Output)
     # threading.Thread(target=play_sound).start()
     return output, gr.Audio("notification.mp3", autoplay=True)
@@ -226,7 +282,8 @@ interface = gr.Interface(
     inputs=["textbox", "number", "number", "textbox","checkbox"],
     outputs=["file", "audio"],
     title="Web Scrapper",
-    description="Enter a URL and download a generated XLSX file."
 )
 interface.launch()

 import random
 import string
+description = """
+For bug reports or improvements, contact [@H4CK3R_5M4CK3R](https://t.me/H4CK3R_5M4CK3R) on Telegram.
+**Usage Instructions:**
+1. **Single Issue Scraping:**
+   - Provide the issue link in the URL section.
+   - Optionally, specify the desired output file name.
+2. **Multiple Issues Scraping:**
+   - Use curly braces `{}` in the URL to indicate where the volume (`v`) and issue (`i`) numbers should be inserted.
+   - Define the range for volumes, not issues. Ensure you pass the volume range correctly.
+**Note:**
+- The range should be the volume range, not the issue range.
+- Some authors may not have a listed address; their profile link will be included in the address section instead.
+- After progress is completed, make sure to click Clear, because sometimes the notification doesn't ring.
+"""
+exmpl = [
+        ["https://www.ams.org/journals/jams/2024-37-01/home.html?active=allissues", 0, 0, "example1", False],
+        ["https://www.degruyter.com/journal/key/fca/20/2/html", 0, 0, "example2", False],
+        ["https://www.degruyter.com/journal/key/fca/{v}/{i}/html", 20, 23, "example3", False],
+        ["https://www.aimspress.com/math/article/2024/8/archive-articles", 0, 0, "example4", False],
+        ["https://www.aimspress.com/math/article/{v}/{i}/archive-articles", 2021, 2024, "example5", False],
+        ["https://link.springer.com/journal/208/volumes-and-issues/388-3", 0, 0, "example6", False],
+        ["https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}", 388, 389, "example6", False],
+        ["https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/158/suppl/C", 0, 0, "example7", False],
+        ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/38/issue/6", 0, 0, "example7", False],
+        ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}", 36, 38, "example7", False]
+    ]
 def generate_random_filename(length=8):
     """Return a random name made of ASCII letters and digits.

     Args:
         length: Number of characters to generate (default 8).

     Returns:
         A string of ``length`` characters drawn, with replacement, from
         ``string.ascii_letters + string.digits``.
     """
     alphabet = string.ascii_letters + string.digits
     return ''.join(random.choices(alphabet, k=length))
                         break
                     sht.save_to_file()
             return sht.save_to_file()
+        elif domain == "onlinelibrary.wiley.com":
+            # Wiley Online Library (onlinelibrary.wiley.com) data here
+            for ur in links:
+                isu = 1
+                while True:
+                    try:
+                        if filen:
+                            print(f"Getting data for link {ur.format(i=isu)}")
+                            authors, _ = wileyscrapper.run(ur.format(i=isu))
+                            isu += 1
+                        else:
+                            print(f"Getting data for link {ur}")
+                            authors, _ = sciencedirect_admaths.run(ur)
+                    except Exception as e:
+                        print(f"Error : {traceback.format_exc()}", show=False)
+                        break
+                    for auth in authors:
+                        sht.save(auth)
+                    if filen == False: # When filen is False the URL is used as-is, so one pass is enough
+                        break
+                    sht.save_to_file()
+            return sht.save_to_file()
         else:
             raise gr.Error("Invalid url found contact : @H4CK3R_5M4CK3R on telegram")
     except gr.Error:
         print(f"ERROR : {traceback.format_exc()}", show=False)
         raise gr.Error("Something error has occur check your url or contact @h4ck3r_5m4ck3r on telegram")
+def handle_url(Url, From_volume: int, To_Volume: int, Output: str, Reverse: bool):
+    output = filterUrlandRun(Url, From_volume, To_Volume, Reverse, Output)
     # threading.Thread(target=play_sound).start()
     return output, gr.Audio("notification.mp3", autoplay=True)
     inputs=["textbox", "number", "number", "textbox","checkbox"],
     outputs=["file", "audio"],
     title="Web Scrapper",
+    description=description,
+    examples=exmpl
 )
 interface.launch()

wileyscrapper.py CHANGED Viewed

@@ -3,8 +3,6 @@ from bs4 import BeautifulSoup
 from sheets import ExcelAutomator
 from seleliumdriver import WebScraper
-browser = requests.session()
 def save(data:str):
     """Dump *data* to ``data.html`` in the current working directory.

     The file is overwritten on every call. ``data`` is passed through
     ``str()`` before writing, so non-string objects are accepted.

     Args:
         data: Content to write (typically page HTML).
     """
     # Explicit UTF-8: the locale default encoding (e.g. cp1252 or ASCII)
     # can raise UnicodeEncodeError on non-ASCII characters in scraped pages.
     with open("data.html", "w", encoding="utf-8") as op:
         op.write(str(data))
@@ -35,26 +33,11 @@ def get_headers(data: str) -> dict:
         out[key] = value
     return out
-def get_links(url: str, issue: int) -> list:
-    headers = """
-    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
-    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
-    Accept-Language: en-US,en;q=0.5
-    Accept-Encoding: gzip, deflate, br
-    Alt-Used: onlinelibrary.wiley.com
-    Connection: keep-alive
-    Cookie: MAID=5SmfjugKsbanANmqY7QFTQ==; MACHINE_LAST_SEEN=2024-07-13T00%3A17%3A54.434-07%3A00; osano_consentmanager_uuid=a09cf48c-a316-44da-a630-b284fe618561; osano_consentmanager=wtFVO73sxrqPK1QjgWvz2PRznZ_IuLc6ARcv2t0_pFtepafXHZgrg-S478uJo9AvbIWsu3sbpgmvaCKL_zNkJQZzpvdHNzGX6NQQ6cwL_c09p-7H9gmYq7lFeOBlGJxYVbwgVIa5TZDqtpLjvla4iYf-rEyPZ0zXi8nZVVY5aCRrKBkWiIYkwIWvpeVeBLepXirD0RkYCGg-O2PWE000CQi4kWVXGTOkNMFqFOSQ-tthQqpC7pvT9AeCAodC2z6CeM6tTjz3TNmp8sTxikwwT4jzZ9HRy76gqQjb8g==; kndctr_1B6E34B85282A0AC0A490D44_AdobeOrg_identity=CiY4MDg5NTE5MTAxMTg2NDkzMzQzMTI2OTY5MjMzMTU3OTYwODc1N1ITCM6izY3mMRABGAEqBElORDEwAPAB5cnS14oy; Hm_lvt_953dddc9c7bea72022e3bd3ba7782e7b=1720765103,1720855080; AMCV_1B6E34B85282A0AC0A490D44%40AdobeOrg=MCMID|80895191011864933431269692331579608757; JSESSIONID=90BFBDCF8874DBB2B708D37ACC4172DD; __cf_bm=FgCtBcokrG75eoj6.nqj2jTcbcl.vtSPGArq4iAYwYk-1720855074-1.0.1.1-OCKWcrDvKtyaaNLld1aBjaFFwZLoLHauSzJ0NEZFn1JLYK4G4lqmaTMEE50PAzZCReTc13aRgLNyLlqu6JOllleWjBRMQr5vc3YjxJ4kdPs; kndctr_1B6E34B85282A0AC0A490D44_AdobeOrg_cluster=ind1; cf_clearance=B0r0CEgCWVP2M5CKvRhRTvIW8MyIJM2WBVS14NsHxxE-1720855079-1.0.1.1-CqrZHd19zoe3QCemtBtqxsHiVLXILmnPkb9RjSG2yHndhy.XZzt14jGpjymiEPzjA0nFP7xw1hU6xsXIz6UDSg; Hm_lpvt_953dddc9c7bea72022e3bd3ba7782e7b=1720855160; HMACCOUNT=C851A9F6625CC221; randomizeUser=0.5543043437474287
-    Upgrade-Insecure-Requests: 1
-    Sec-Fetch-Dest: document
-    Sec-Fetch-Mode: navigate
-    Sec-Fetch-Site: none
-    Sec-Fetch-User: ?1
-    Sec-GPC: 1
-    TE: trailers
-    """
-    # url = f"https://onlinelibrary.wiley.com/toc/14679590/{year}/{volume}/{issue}"
-    data = browser.get(url, headers=get_headers(headers))
-    fullPage = BeautifulSoup(data.text, "lxml")
     issuelinks = []
     for link in fullPage.findAll("a", {"class" : "issue-item__title visitable"}):
         issuelinks.append(f'https://onlinelibrary.wiley.com{link.get("href")}')

 from sheets import ExcelAutomator
 from seleliumdriver import WebScraper
 def save(data:str):
     """Dump *data* to ``data.html`` in the current working directory.

     The file is overwritten on every call. ``data`` is passed through
     ``str()`` before writing, so non-string objects are accepted.

     Args:
         data: Content to write (typically page HTML).
     """
     # Explicit UTF-8: the locale default encoding (e.g. cp1252 or ASCII)
     # can raise UnicodeEncodeError on non-ASCII characters in scraped pages.
     with open("data.html", "w", encoding="utf-8") as op:
         op.write(str(data))
         out[key] = value
     return out
+def get_links(url: str) -> list:
+    browser = WebScraper(browser="firefox", hidden=False)
+    browser.get(url) #browser.get(url, headers=get_headers(headers))
+    fullPage = BeautifulSoup(browser.get_html(), "lxml")
+    save(browser.get_html())
     issuelinks = []
     for link in fullPage.findAll("a", {"class" : "issue-item__title visitable"}):
         issuelinks.append(f'https://onlinelibrary.wiley.com{link.get("href")}')