Commit 7bed2d8 · H4CK3R-5M4CK3R committed · Parent: 0b55168

added multiple urls scrapper

Changed files:
- app.py +61 -27
- sciencedirect.py +2 -2
- sciencedirect_admaths.py +1 -1
app.py
CHANGED
@@ -12,7 +12,6 @@ import os
 import random
 import string
 import json
-import base64
 import crypto
 
 auth = [
@@ -33,6 +32,8 @@ For bug reports or improvements, contact [@H4CK3R_5M4CK3R](https://t.me/H4CK3R_5
 2. **Multiple Issues Scraping:**
    - Use curly braces `{}` in the URL to indicate where the volume (`v`) and issue (`i`) numbers should be inserted.
    - Define the range for volumes, not issues. Ensure you pass the volume range correctly.
+   - To pass the range, you must use `()` at the start of the URL, like `(from:to)-link/{v}/{i}`; check out the examples below.
+   - You can now also pass multiple links and add a range to each one in the same way as above.
 
 3. **Read this before using google sheet feature**
    - **IMPORTANT** First make a google drive folder and then gave access to `[email protected]` and `[email protected]` super important.
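The new range prefix is handled with plain string slicing rather than a regex. Below is a minimal sketch of that parsing as a standalone helper; `parse_range_prefix` is an illustrative name, since in app.py this logic lives inline in `handle_url` (shown further down in this diff):

```python
# Illustrative helper; app.py performs this inline in handle_url.
def parse_range_prefix(url: str) -> tuple[int, int, str]:
    if url.startswith("("):
        # "(22:23)-https://..." -> from=22, to=23, url="https://..."
        from_volume = int(url.split("(", 1)[1].split(":")[0])
        to_volume = int(url.split("(", 1)[1].split(":")[1].split(")", 1)[0])
        url = url.split("-", 1)[1].strip()
        return from_volume, to_volume, url
    return 0, 0, url  # no prefix: defaults used by handle_url

assert parse_range_prefix(
    "(22:23)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html"
) == (22, 23, "https://www.degruyter.com/journal/key/fca/{v}/{i}/html")
```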
@@ -55,16 +56,17 @@ If you are copying this space make sure to contact the owner as mention above
 """
 
 exmpl = [
-    ["https://www.ams.org/journals/jams/2024-37-01/home.html?active=allissues",
-    ["https://www.degruyter.com/journal/key/fca/20/2/html",
-    ["https://www.degruyter.com/journal/key/fca/{v}/{i}/html",
-    ["https://www.aimspress.com/math/article/2024/8/archive-articles",
-    ["https://www.aimspress.com/math/article/{v}/{i}/archive-articles",
-    ["https://link.springer.com/journal/208/volumes-and-issues/388-3",
-    ["https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}",
-    ["https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/158/suppl/C",
-    ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/38/issue/6",
-    ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}",
+    ["https://www.ams.org/journals/jams/2024-37-01/home.html?active=allissues", "[email protected]", "asdfasdfasdfasdfasdf", "example1", True, True, False],
+    ["https://www.degruyter.com/journal/key/fca/20/2/html", "[email protected]", "asdfasdfasdfasdfasdf", "example2", True, True, False],
+    ["(22:23)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html", "[email protected]", "asdfasdfasdfasdfasdf", "example3", True, True, False],
+    ["https://www.aimspress.com/math/article/2024/8/archive-articles", "[email protected]", "asdfasdfasdfasdfasdf", "example4", True, True, False],
+    ["(2021:2022)-https://www.aimspress.com/math/article/{v}/{i}/archive-articles", "[email protected]", "asdfasdfasdfasdfasdf", "example5", True, True, False],
+    ["https://link.springer.com/journal/208/volumes-and-issues/388-3", "[email protected]", "asdfasdfasdfasdfasdf", "example6", True, True, False],
+    ["(388:389)-https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}", "[email protected]", "asdfasdfasdfasdfasdf", "example7", True, True, False],
+    ["https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/158/suppl/C", "[email protected]", "asdfasdfasdfasdfasdf", "example8", True, True, False],
+    ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/38/issue/6", "[email protected]", "asdfasdfasdfasdfasdf", "example9", True, True, False],
+    ["(37:38)-https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}", "[email protected]", "asdfasdfasdfasdfasdf", "example10", True, True, False],
+    ["(37:38)-https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}\n(388:389)-https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}\n(2021:2022)-https://www.aimspress.com/math/article/{v}/{i}/archive-articles\n(22:23)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html", "[email protected]", "asdfasdfasdfasdfasdf", "example11", True, True, False]
 ]
 
 stop_work = False
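For reference, example11 above is exactly what the new multi-URL mode expects as raw input: one range-prefixed template per line in the URL field.

```
(37:38)-https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}
(388:389)-https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}
(2021:2022)-https://www.aimspress.com/math/article/{v}/{i}/archive-articles
(22:23)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html
```

`handle_url` splits this on `"\n"` and runs each template through the same volume/issue loop.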
@@ -73,7 +75,7 @@ def generate_random_filename(length=8):
     random_string = ''.join(random.choices(string.ascii_letters + string.digits, k=length))
     return random_string
 
-def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, output: str, owner: bool, mail: str, folder_id: str, credit: dict, usegooglesheet: bool):
+def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, output: str, owner: bool, mail: str, folder_id: str, credit: dict, usegooglesheet: bool, multiUrls: bool):
     if len(output.strip()) < 1:
         output = generate_random_filename()
     if os.path.exists(f"{output}.xlsx"):
@@ -84,8 +86,8 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
     shet = GoogleSheetAutomator(
         [
             "Name",
-            "
-            "
+            "Address",
+            "Email"
         ],
         folder_id,
         outputfile=output,
@@ -93,8 +95,8 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
     )
     sht = ExcelAutomator([
         "Name",
-        "
-        "
+        "Address",
+        "Email"
     ], output)
     filen = True
     if "{" in url:
@@ -266,6 +268,7 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
         yield {"final_output": sht.save_to_file(), "link" : ""}
 
     elif domain == "www.sciencedirect.com":
+        oldtitle = ""
         for ur in links:
             isu = 1
             while True:
@@ -274,11 +277,19 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
                 try:
                     if filen:
                         current_url = ur.format(i=isu)
-                        authors,
+                        authors, title = sciencedirect.run(current_url)
+                        if title == oldtitle:
+                            break
+                        else:
+                            oldtitle = title
                         isu += 1
                     else:
                         current_url = ur
-                        authors,
+                        authors, title = sciencedirect.run(current_url)
+                        if title == oldtitle:
+                            break
+                        else:
+                            oldtitle = title
                 except Exception as e:
                     break
                 yield {"current_url": current_url, "status": "fetching"}
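The duplicate-title check is the interesting part of this hunk: when the issue counter runs past the last published issue, ScienceDirect keeps serving a page rather than failing, so the loop stops once two consecutive fetches return the same title. A distilled sketch of the pattern, assuming `{v}` was already substituted into the template and that `sciencedirect.run` returns `(authors, title)` as this diff shows:

```python
import sciencedirect  # project module; run(url) -> (authors, title) per this commit

def scrape_all_issues(url_template: str):
    """Walk issue numbers upward until the journal page repeats."""
    oldtitle = ""
    isu = 1
    while True:
        current_url = url_template.format(i=isu)
        authors, title = sciencedirect.run(current_url)
        if title == oldtitle:
            break  # same page served again: issue number ran off the end
        oldtitle = title
        yield current_url, authors
        isu += 1
```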
@@ -303,6 +314,7 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
         yield {"final_output": sht.save_to_file(), "link" : ""}
 
     elif domain == "www.sciencedirect.com" and "acta-mathematica-scientia" in url:
+        oldtitle = ""
         for ur in links:
             isu = 1
             while True:
@@ -311,11 +323,19 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
                 try:
                     if filen:
                         current_url = ur.format(i=isu)
-                        authors,
+                        authors, title = sciencedirect_admaths.run(current_url)
+                        if title == oldtitle:
+                            break
+                        else:
+                            oldtitle = title
                         isu += 1
                     else:
                         current_url = ur
-                        authors,
+                        authors, title = sciencedirect_admaths.run(current_url)
+                        if title == oldtitle:
+                            break
+                        else:
+                            oldtitle = title
                 except Exception as e:
                     break
                 yield {"current_url": current_url, "status": "fetching"}
@@ -354,25 +374,35 @@ def handle_url(Link: str, Gmail: str, FolderId: str, Output: str, MakeOwner:bool
     authors = []
     details = []
     final_output = None
+    multi_output = None
     link = None
     From_volume = 0
     To_Volume = 0
     urls = []
     if "\n" in Link:
         urls = Link.split("\n")
+        multiUrls = True
     else:
         urls.append(Link)
+        multiUrls = False
+    print(f"URLS : {urls}")
     for url in urls:
         if url.startswith("("):
-            From_volume = url.split("(", 1)[1].split(":")[0]
-            To_Volume = url.split("(", 1)[1].split(":")[1].split(")", 1)[0]
+            From_volume = int(url.split("(", 1)[1].split(":")[0])
+            To_Volume = int(url.split("(", 1)[1].split(":")[1].split(")", 1)[0])
             url = url.split("-", 1)[1].strip()
-
-
+        try:
+            credit = crypto.decrypt(os.getenv("KEY"), os.getenv("GOOGLE_AUTH_CREDENTIALS"))
+            credit = json.loads(credit)
+        except:
+            pass
 
-        for _, result in enumerate(filterUrlandRun(url, From_volume, To_Volume, Reverse, Output, MakeOwner, Gmail, FolderId, credit=credit, usegooglesheet=UseGoogleSheet)):
+        for _, result in enumerate(filterUrlandRun(url, From_volume, To_Volume, Reverse, Output, MakeOwner, Gmail, FolderId, credit=credit, usegooglesheet=UseGoogleSheet, multiUrls=multiUrls)):
             if "final_output" in result:
-
+                if multiUrls == False:
+                    final_output = result["final_output"]
+                else:
+                    multi_output = result["final_output"]
                 link = result["link"]
             else:
                 if "author" in result:
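With this commit, the Google credentials are decrypted once per URL, inside the loop. A minimal sketch of that path, assuming `crypto.decrypt(key, ciphertext)` returns the service-account JSON as a string (the signature is taken from the call above) and that `KEY` and `GOOGLE_AUTH_CREDENTIALS` are the Space's secrets:

```python
import json
import os

import crypto  # project-local module; decrypt(key, ciphertext) -> str

def load_google_credentials() -> dict | None:
    """Illustrative wrapper around the inline try/except in handle_url."""
    try:
        raw = crypto.decrypt(os.getenv("KEY"), os.getenv("GOOGLE_AUTH_CREDENTIALS"))
        return json.loads(raw)
    except Exception:
        return None  # app.py silently falls back (bare except: pass)
```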
@@ -383,11 +413,15 @@ def handle_url(Link: str, Gmail: str, FolderId: str, Output: str, MakeOwner:bool
                 details.append(f"Scraping: {current_url}\n")
         authors = authors[-3:] if len(authors) > 3 else authors
         details = details[-3:] if len(details) > 3 else details
+        if multiUrls:
+            final_output = None
         yield "\n".join(authors), "\n".join(details), final_output if final_output else None, gr.Audio("notification.mp3", autoplay=True) if final_output else None, link
+    if multiUrls == True:
+        yield "\n".join(authors), "\n".join(details), multi_output, gr.Audio("notification.mp3", autoplay=True), link
 
 interface = gr.Interface(
     fn=handle_url,
-    inputs=["
+    inputs=[gr.TextArea(label="Url / Url's", placeholder="Enter the url or multiple urls to scrap"), gr.Textbox(label="Access Gmail (Check Docs)"), gr.Textbox(label="Google Folder ID (Check Docs)"), gr.Textbox(label="Output File Name"), gr.Checkbox(True, label="Make Owner"), gr.Checkbox(True, label="Use Google Sheets"), "checkbox"],
     outputs=[gr.Markdown("LOGS", height="250px", elem_id="authorscroll"), gr.Markdown("", height="100px", elem_id="authorscroll"), "file", "audio", "textbox"],
     title="Web Scraper",
     description=description,
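The interface wiring depends on Gradio's generator support: because `handle_url` yields, each yielded 5-tuple re-renders the five outputs, which is how the logs stream while a job runs. A self-contained sketch of that pattern with illustrative names (not the app's actual inputs):

```python
import gradio as gr

def stream_lines(text: str):
    # Yield one update per input line; Gradio re-renders the outputs on each yield.
    lines = [l for l in text.splitlines() if l.strip()] or ["(empty)"]
    for n, line in enumerate(lines, start=1):
        yield f"processing: {line}", f"{n}/{len(lines)} done"

demo = gr.Interface(
    fn=stream_lines,
    inputs=gr.TextArea(label="Url / Url's"),
    outputs=[gr.Markdown(), gr.Markdown()],
)

if __name__ == "__main__":
    demo.launch()
```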
sciencedirect.py
CHANGED
@@ -80,7 +80,7 @@ Sec-GPC: 1
     # url = f"https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{volume}/issue/{issue}"
 
     data = req.get(url, headers=headers)
-
+    print(f"URL : {data.url}")
     artical_links = []
     fullpage = BeautifulSoup(str(data.text), "lxml")
     if fullpage.title.string.strip() == last_artical_name:
@@ -169,7 +169,7 @@ Sec-GPC: 1
     except:
         address = "Not Found"
     if address == "Not Found":
-        address =
+        address = ""
     output.append(
         {
             'Name' : aut['Name'],
sciencedirect_admaths.py
CHANGED
@@ -169,7 +169,7 @@ Sec-GPC: 1
     except:
         address = "Not Found"
     if address == "Not Found":
-        address =
+        address = ""
     output.append(
         {
             'Name' : aut['Name'],