page_search
1from __future__ import annotations 2import os 3import re 4import bs4 5from src.download import * 6from typing import Any, Generator 7 8def remove_style(soup: bs4.BeautifulSoup | bs4.Tag) -> bs4.BeautifulSoup | bs4.Tag: 9 """ 10 Removes everything between <style>...</style> tags from the content. 11 """ 12 for style in soup("style"): 13 style.decompose() 14 return soup 15 16def remove_empty_lines(text: Any) -> str: 17 """ 18 Removes empty lines from text. 19 """ 20 if isinstance(text, str): 21 return "\n".join([line for line in text.split("\n") if line.strip() != ""]) 22 return "" 23 24def normalize_to_beautifulsoup(soup: bs4.BeautifulSoup | bs4.Tag | requests.Response | str | bytes) -> bs4.BeautifulSoup: 25 """ 26 Normalize input to a BeautifulSoup object. 27 """ 28 if not isinstance(soup, bs4.BeautifulSoup): 29 if hasattr(soup, "text"): 30 html = soup.text or "" 31 else: 32 html = soup or "" 33 if isinstance(html, bytes): 34 try: 35 html = html.decode("utf-8", errors="ignore") 36 except Exception: 37 html = str(html) 38 soup = bs4.BeautifulSoup(html, "html.parser") 39 return soup 40 41def find_label_span_by_regex(soup, regex) -> bs4.Tag | None: 42 """ 43 Finds a <span> whose text matches the given regex. 44 """ 45 return soup.find("span", string=re.compile(regex, re.I)) 46 47def extract_value_from_label(label_span): 48 """ 49 Extracts the value associated with a label span. 50 It first looks for the next sibling <span>, and if not found, it checks the same <li> for a second <span>. 
51 """ 52 if not label_span: 53 return "" 54 # preferred: the following sibling <span> 55 val_span = label_span.find_next_sibling("span") 56 if val_span and val_span.get_text(strip=True): 57 return val_span.get_text(strip=True) 58 # fallback: same <li> second span 59 li = label_span.find_parent("li") 60 if li: 61 spans = li.find_all("span") 62 if len(spans) >= 2 and spans[1].get_text(strip=True): 63 return spans[1].get_text(strip=True) 64 return "" 65 66def any_text_coresponds_to(text, texts) -> bool: 67 """ 68 Check if any text corresponds to the given texts. 69 """ 70 return any([t in text for t in texts]) 71 72class InsufficientTimeoutError(Exception): 73 """ 74 Exception raised for insufficient timeout. 75 """ 76 def __init__(self, message="Timeout is too short."): 77 self.message = message 78 super().__init__(self.message) 79 80class Download_page_search: 81 """ 82 Abstract class for searching download pages. 83 """ 84 85 file_types = { 86 "all": "", 87 "video": "video", 88 "audio": "audio", 89 "archive": "archive", 90 "images": "image" 91 } 92 search_types = { 93 "relevance": "", 94 "most_downloaded": "3", 95 "newest": "4", 96 "biggest": "1", 97 "smallest": "2" 98 } 99 100 def __init__(self): 101 raise NotImplementedError() 102 103 def search(self, prompt, file_type="all", search_type="relevance") -> Generator["Link_to_file", None, None]: 104 """ 105 Search for files on the website. 106 """ 107 if prompt is None or prompt.strip() == "": 108 raise ValueError("Prompt cannot be empty.") 109 raise NotImplementedError() 110 111 @staticmethod 112 def generate_search_url(prompt, file_type="all", search_type="relevance") -> str: 113 """ 114 Generate search URL from input attributes. 
115 """ 116 raise NotImplementedError() 117 118 @staticmethod 119 def test_downloaded_file(link_2_file, download_folder) -> bool: 120 from src.link_to_file import compare_sizes 121 122 file_size = os.path.getsize(f"{download_folder}/{link_2_file.title}") 123 if file_size == 0: 124 raise ValueError("ERROR: File is empty.") 125 elif link_2_file.size != None and not compare_sizes(file_size, link_2_file.size, 20/100): 126 raise ValueError("ERROR: File size does not match.") 127 return True 128 129 @staticmethod 130 def parse_catalogue(page) -> Generator["Link_to_file", None, None]: 131 """ 132 Parse the catalogue page and yield Link_to_file objects. 133 """ 134 raise NotImplementedError() 135 136 @staticmethod 137 def get_download_link_from_detail(detail_url: str) -> str: 138 """ 139 Get the direct download link from the detail page URL. 140 """ 141 raise NotImplementedError() 142
def
remove_style( soup: bs4.BeautifulSoup | bs4.element.Tag) -> bs4.BeautifulSoup | bs4.element.Tag:
9def remove_style(soup: bs4.BeautifulSoup | bs4.Tag) -> bs4.BeautifulSoup | bs4.Tag: 10 """ 11 Removes everything between <style>...</style> tags from the content. 12 """ 13 for style in soup("style"): 14 style.decompose() 15 return soup
Removes everything between <style>...</style> tags from the content.
def
remove_empty_lines(text: Any) -> str:
17def remove_empty_lines(text: Any) -> str: 18 """ 19 Removes empty lines from text. 20 """ 21 if isinstance(text, str): 22 return "\n".join([line for line in text.split("\n") if line.strip() != ""]) 23 return ""
Removes empty lines from text.
def
normalize_to_beautifulsoup( soup: bs4.BeautifulSoup | bs4.element.Tag | requests.models.Response | str | bytes) -> bs4.BeautifulSoup:
25def normalize_to_beautifulsoup(soup: bs4.BeautifulSoup | bs4.Tag | requests.Response | str | bytes) -> bs4.BeautifulSoup: 26 """ 27 Normalize input to a BeautifulSoup object. 28 """ 29 if not isinstance(soup, bs4.BeautifulSoup): 30 if hasattr(soup, "text"): 31 html = soup.text or "" 32 else: 33 html = soup or "" 34 if isinstance(html, bytes): 35 try: 36 html = html.decode("utf-8", errors="ignore") 37 except Exception: 38 html = str(html) 39 soup = bs4.BeautifulSoup(html, "html.parser") 40 return soup
Normalize input to a BeautifulSoup object.
def
find_label_span_by_regex(soup, regex) -> bs4.element.Tag | None:
42def find_label_span_by_regex(soup, regex) -> bs4.Tag | None: 43 """ 44 Finds a <span> whose text matches the given regex. 45 """ 46 return soup.find("span", string=re.compile(regex, re.I))
Finds a <span> whose text matches the given regex.
def
extract_value_from_label(label_span):
48def extract_value_from_label(label_span): 49 """ 50 Extracts the value associated with a label span. 51 It first looks for the next sibling <span>, and if not found, it checks the same <li> for a second <span>. 52 """ 53 if not label_span: 54 return "" 55 # preferred: the following sibling <span> 56 val_span = label_span.find_next_sibling("span") 57 if val_span and val_span.get_text(strip=True): 58 return val_span.get_text(strip=True) 59 # fallback: same <li> second span 60 li = label_span.find_parent("li") 61 if li: 62 spans = li.find_all("span") 63 if len(spans) >= 2 and spans[1].get_text(strip=True): 64 return spans[1].get_text(strip=True) 65 return ""
Extracts the value associated with a label span.
It first looks for the next sibling <span>, and if not found, it checks the same <li> for a second <span>.
def
any_text_coresponds_to(text, texts) -> bool:
67def any_text_coresponds_to(text, texts) -> bool: 68 """ 69 Check if any text corresponds to the given texts. 70 """ 71 return any([t in text for t in texts])
Check if any text corresponds to the given texts.
class
InsufficientTimeoutError(builtins.Exception):
73class InsufficientTimeoutError(Exception): 74 """ 75 Exception raised for insufficient timeout. 76 """ 77 def __init__(self, message="Timeout is too short."): 78 self.message = message 79 super().__init__(self.message)
Exception raised for insufficient timeout.
class
Download_page_search:
81class Download_page_search: 82 """ 83 Abstract class for searching download pages. 84 """ 85 86 file_types = { 87 "all": "", 88 "video": "video", 89 "audio": "audio", 90 "archive": "archive", 91 "images": "image" 92 } 93 search_types = { 94 "relevance": "", 95 "most_downloaded": "3", 96 "newest": "4", 97 "biggest": "1", 98 "smallest": "2" 99 } 100 101 def __init__(self): 102 raise NotImplementedError() 103 104 def search(self, prompt, file_type="all", search_type="relevance") -> Generator["Link_to_file", None, None]: 105 """ 106 Search for files on the website. 107 """ 108 if prompt is None or prompt.strip() == "": 109 raise ValueError("Prompt cannot be empty.") 110 raise NotImplementedError() 111 112 @staticmethod 113 def generate_search_url(prompt, file_type="all", search_type="relevance") -> str: 114 """ 115 Generate search URL from input attributes. 116 """ 117 raise NotImplementedError() 118 119 @staticmethod 120 def test_downloaded_file(link_2_file, download_folder) -> bool: 121 from src.link_to_file import compare_sizes 122 123 file_size = os.path.getsize(f"{download_folder}/{link_2_file.title}") 124 if file_size == 0: 125 raise ValueError("ERROR: File is empty.") 126 elif link_2_file.size != None and not compare_sizes(file_size, link_2_file.size, 20/100): 127 raise ValueError("ERROR: File size does not match.") 128 return True 129 130 @staticmethod 131 def parse_catalogue(page) -> Generator["Link_to_file", None, None]: 132 """ 133 Parse the catalogue page and yield Link_to_file objects. 134 """ 135 raise NotImplementedError() 136 137 @staticmethod 138 def get_download_link_from_detail(detail_url: str) -> str: 139 """ 140 Get the direct download link from the detail page URL. 141 """ 142 raise NotImplementedError()
Abstract class for searching download pages.
file_types =
{'all': '', 'video': 'video', 'audio': 'audio', 'archive': 'archive', 'images': 'image'}
search_types =
{'relevance': '', 'most_downloaded': '3', 'newest': '4', 'biggest': '1', 'smallest': '2'}
def
search( self, prompt, file_type='all', search_type='relevance') -> "Generator['Link_to_file', None, None]":
104 def search(self, prompt, file_type="all", search_type="relevance") -> Generator["Link_to_file", None, None]: 105 """ 106 Search for files on the website. 107 """ 108 if prompt is None or prompt.strip() == "": 109 raise ValueError("Prompt cannot be empty.") 110 raise NotImplementedError()
Search for files on the website.
@staticmethod
def
generate_search_url(prompt, file_type='all', search_type='relevance') -> str:
112 @staticmethod 113 def generate_search_url(prompt, file_type="all", search_type="relevance") -> str: 114 """ 115 Generate search URL from input attributes. 116 """ 117 raise NotImplementedError()
Generate search URL from input attributes.
@staticmethod
def
test_downloaded_file(link_2_file, download_folder) -> bool:
119 @staticmethod 120 def test_downloaded_file(link_2_file, download_folder) -> bool: 121 from src.link_to_file import compare_sizes 122 123 file_size = os.path.getsize(f"{download_folder}/{link_2_file.title}") 124 if file_size == 0: 125 raise ValueError("ERROR: File is empty.") 126 elif link_2_file.size != None and not compare_sizes(file_size, link_2_file.size, 20/100): 127 raise ValueError("ERROR: File size does not match.") 128 return True
@staticmethod
def
parse_catalogue(page) -> "Generator['Link_to_file', None, None]":
130 @staticmethod 131 def parse_catalogue(page) -> Generator["Link_to_file", None, None]: 132 """ 133 Parse the catalogue page and yield Link_to_file objects. 134 """ 135 raise NotImplementedError()
Parse the catalogue page and yield Link_to_file objects.
@staticmethod
def
get_download_link_from_detail(detail_url: str) -> str:
137 @staticmethod 138 def get_download_link_from_detail(detail_url: str) -> str: 139 """ 140 Get the direct download link from the detail page URL. 141 """ 142 raise NotImplementedError()
Get the direct download link from the detail page URL.