page_search

View Source

  1from __future__ import annotations
  2import os
  3import re
  4import bs4
  5from src.download import *
  6from typing import Any, Generator
  7
  8def remove_style(soup: bs4.BeautifulSoup | bs4.Tag) -> bs4.BeautifulSoup | bs4.Tag:
  9    """
 10    Removes everything between <style>...</style> tags from the content.
 11    """
 12    for style in soup("style"):
 13        style.decompose()
 14    return soup
 15
 16def remove_empty_lines(text: Any) -> str:
 17    """
 18    Removes empty lines from text.
 19    """
 20    if isinstance(text, str):
 21        return "\n".join([line for line in text.split("\n") if line.strip() != ""])
 22    return ""
 23
 24def normalize_to_beautifulsoup(soup: bs4.BeautifulSoup | bs4.Tag | requests.Response | str | bytes) -> bs4.BeautifulSoup:
 25    """
 26    Normalize input to a BeautifulSoup object.
 27    """
 28    if not isinstance(soup, bs4.BeautifulSoup):
 29        if hasattr(soup, "text"):
 30            html = soup.text or ""
 31        else:
 32            html = soup or ""
 33        if isinstance(html, bytes):
 34            try:
 35                html = html.decode("utf-8", errors="ignore")
 36            except Exception:
 37                html = str(html)
 38        soup = bs4.BeautifulSoup(html, "html.parser")
 39    return soup
 40
 41def find_label_span_by_regex(soup, regex) -> bs4.Tag | None:
 42    """
 43    Finds a <span> whose text matches the given regex.
 44    """
 45    return soup.find("span", string=re.compile(regex, re.I))
 46
 47def extract_value_from_label(label_span):
 48    """
 49    Extracts the value associated with a label span.
 50    It first looks for the next sibling <span>, and if not found, it checks the same <li> for a second <span>.
 51    """
 52    if not label_span:
 53        return ""
 54    # preferred: the following sibling <span>
 55    val_span = label_span.find_next_sibling("span")
 56    if val_span and val_span.get_text(strip=True):
 57        return val_span.get_text(strip=True)
 58    # fallback: same <li> second span
 59    li = label_span.find_parent("li")
 60    if li:
 61        spans = li.find_all("span")
 62        if len(spans) >= 2 and spans[1].get_text(strip=True):
 63            return spans[1].get_text(strip=True)
 64    return ""
 65
 66def any_text_coresponds_to(text, texts) -> bool:
 67    """
 68    Check if any text corresponds to the given texts.
 69    """
 70    return any([t in text for t in texts])
 71
 72class InsufficientTimeoutError(Exception):
 73    """
 74    Exception raised for insufficient timeout.
 75    """
 76    def __init__(self, message="Timeout is too short."):
 77        self.message = message
 78        super().__init__(self.message)
 79
 80class Download_page_search:
 81    """
 82    Abstract class for searching download pages.
 83    """
 84    
 85    file_types = {
 86        "all": "",
 87        "video": "video",
 88        "audio": "audio",
 89        "archive": "archive",
 90        "images": "image"
 91    }
 92    search_types = {
 93        "relevance": "",
 94        "most_downloaded": "3",
 95        "newest": "4",
 96        "biggest": "1",
 97        "smallest": "2"
 98    }
 99        
100    def __init__(self):
101        raise NotImplementedError()
102    
103    def search(self, prompt, file_type="all", search_type="relevance") -> Generator["Link_to_file", None, None]:
104        """
105        Search for files on the website.
106        """
107        if prompt is None or prompt.strip() == "":
108            raise ValueError("Prompt cannot be empty.")
109        raise NotImplementedError()
110    
111    @staticmethod
112    def generate_search_url(prompt, file_type="all", search_type="relevance") -> str:
113        """
114        Generate search URL from input attributes.
115        """
116        raise NotImplementedError()
117    
118    @staticmethod
119    def test_downloaded_file(link_2_file, download_folder) -> bool:
120        from src.link_to_file import compare_sizes
121        
122        file_size = os.path.getsize(f"{download_folder}/{link_2_file.title}")
123        if file_size == 0:
124            raise ValueError("ERROR: File is empty.")
125        elif link_2_file.size != None and not compare_sizes(file_size, link_2_file.size, 20/100):
126            raise ValueError("ERROR: File size does not match.")
127        return True
128    
129    @staticmethod
130    def parse_catalogue(page) -> Generator["Link_to_file", None, None]:
131        """
132        Parse the catalogue page and yield Link_to_file objects.
133        """
134        raise NotImplementedError()
135    
136    @staticmethod
137    def get_download_link_from_detail(detail_url: str) -> str:
138        """
139        Get the direct download link from the detail page URL.
140        """
141        raise NotImplementedError()
142

def remove_style( soup: bs4.BeautifulSoup | bs4.element.Tag) -> bs4.BeautifulSoup | bs4.element.Tag: View Source

 9def remove_style(soup: bs4.BeautifulSoup | bs4.Tag) -> bs4.BeautifulSoup | bs4.Tag:
10    """
11    Removes everything between <style>...</style> tags from the content.
12    """
13    for style in soup("style"):
14        style.decompose()
15    return soup

Removes everything between <style>...</style> tags from the content.

def remove_empty_lines(text: Any) -> str: View Source

def remove_empty_lines(text: Any) -> str:
    """
    Drop blank and whitespace-only lines from a string.

    The input is split on newline characters and the surviving lines are
    re-joined unchanged. Any value that is not a ``str`` yields "".
    """
    if not isinstance(text, str):
        return ""
    kept_rows = (row for row in text.split("\n") if row.strip())
    return "\n".join(kept_rows)

Removes empty lines from text.

def normalize_to_beautifulsoup( soup: bs4.BeautifulSoup | bs4.element.Tag | requests.models.Response | str | bytes) -> bs4.BeautifulSoup: View Source

def normalize_to_beautifulsoup(soup: bs4.BeautifulSoup | bs4.Tag | requests.Response | str | bytes) -> bs4.BeautifulSoup:
    """
    Normalize input to a BeautifulSoup object.

    * A ``BeautifulSoup`` instance is returned unchanged.
    * A ``bs4.Tag`` is serialized back to markup and re-parsed.
    * Any other object exposing a ``text`` attribute contributes that
      attribute (an empty/None value is treated as "").
    * ``str`` / ``bytes`` are parsed as markup; bytes are decoded as UTF-8
      with undecodable bytes dropped. Falsy input is parsed as "".

    Parsing always uses the "html.parser" backend.
    """
    if isinstance(soup, bs4.BeautifulSoup):
        return soup

    if isinstance(soup, bs4.Tag):
        # Tag.text is the tag-stripped text content; feeding that back into
        # the parser would lose every element, so serialize the tag instead.
        html = str(soup)
    elif hasattr(soup, "text"):
        html = soup.text or ""
    else:
        html = soup or ""

    if isinstance(html, bytes):
        # errors="ignore" never raises on undecodable bytes, so no try/except.
        html = html.decode("utf-8", errors="ignore")
    return bs4.BeautifulSoup(html, "html.parser")

Normalize input to a BeautifulSoup object.

def find_label_span_by_regex(soup, regex) -> bs4.element.Tag | None: View Source

def find_label_span_by_regex(soup, regex) -> bs4.Tag | None:
    """
    Look up a <span> whose string matches ``regex``, ignoring case.

    The pattern is compiled with ``re.I`` and handed to ``soup.find``; the
    result of that call is returned as-is.
    """
    label_pattern = re.compile(regex, flags=re.I)
    return soup.find("span", string=label_pattern)

Finds a <span> whose text matches the given regex.

def extract_value_from_label(label_span): View Source

def extract_value_from_label(label_span):
    """
    Return the stripped text of the value that belongs to a label span.

    Lookup order:
      1. the next sibling <span> of the label, if it has non-empty text;
      2. otherwise the second <span> inside the label's enclosing <li>.

    A falsy ``label_span`` or a failed lookup yields "".
    """
    if not label_span:
        return ""

    # First choice: the <span> that directly follows the label.
    sibling = label_span.find_next_sibling("span")
    sibling_text = sibling.get_text(strip=True) if sibling else ""
    if sibling_text:
        return sibling_text

    # Second choice: the second <span> of the surrounding <li>.
    list_item = label_span.find_parent("li")
    if list_item:
        item_spans = list_item.find_all("span")
        if len(item_spans) >= 2:
            return item_spans[1].get_text(strip=True) or ""
    return ""

Extracts the value associated with a label span. It first looks for the next sibling <span>, and if not found, it checks the same <li> for a second <span>.

def any_text_coresponds_to(text, texts) -> bool: View Source

def any_text_coresponds_to(text, texts) -> bool:
    """
    Check whether at least one item of ``texts`` is contained in ``text``.

    Containment is tested with the ``in`` operator (``item in text``).
    ``texts`` is consumed lazily and only up to the first match, so it may be
    any iterable, including a generator. An empty ``texts`` gives False.
    """
    # Generator expression instead of a list: any() stops at the first hit
    # and no throwaway list is built.
    return any(candidate in text for candidate in texts)

Check if any text corresponds to the given texts.

class InsufficientTimeoutError(builtins.Exception): View Source

class InsufficientTimeoutError(Exception):
    """
    Error signalling that a timeout was too short.

    The text is passed to ``Exception`` and is also kept on the ``message``
    attribute.
    """

    def __init__(self, message="Timeout is too short."):
        super().__init__(message)
        self.message = message

Exception raised for insufficient timeout.

InsufficientTimeoutError(message='Timeout is too short.') View Source

77    def __init__(self, message="Timeout is too short."):
78        self.message = message
79        super().__init__(self.message)

message

class Download_page_search: View Source

 81class Download_page_search:
 82    """
 83    Abstract class for searching download pages.
 84    """
 85    
 86    file_types = {
 87        "all": "",
 88        "video": "video",
 89        "audio": "audio",
 90        "archive": "archive",
 91        "images": "image"
 92    }
 93    search_types = {
 94        "relevance": "",
 95        "most_downloaded": "3",
 96        "newest": "4",
 97        "biggest": "1",
 98        "smallest": "2"
 99    }
100        
101    def __init__(self):
102        raise NotImplementedError()
103    
104    def search(self, prompt, file_type="all", search_type="relevance") -> Generator["Link_to_file", None, None]:
105        """
106        Search for files on the website.
107        """
108        if prompt is None or prompt.strip() == "":
109            raise ValueError("Prompt cannot be empty.")
110        raise NotImplementedError()
111    
112    @staticmethod
113    def generate_search_url(prompt, file_type="all", search_type="relevance") -> str:
114        """
115        Generate search URL from input attributes.
116        """
117        raise NotImplementedError()
118    
119    @staticmethod
120    def test_downloaded_file(link_2_file, download_folder) -> bool:
121        from src.link_to_file import compare_sizes
122        
123        file_size = os.path.getsize(f"{download_folder}/{link_2_file.title}")
124        if file_size == 0:
125            raise ValueError("ERROR: File is empty.")
126        elif link_2_file.size != None and not compare_sizes(file_size, link_2_file.size, 20/100):
127            raise ValueError("ERROR: File size does not match.")
128        return True
129    
130    @staticmethod
131    def parse_catalogue(page) -> Generator["Link_to_file", None, None]:
132        """
133        Parse the catalogue page and yield Link_to_file objects.
134        """
135        raise NotImplementedError()
136    
137    @staticmethod
138    def get_download_link_from_detail(detail_url: str) -> str:
139        """
140        Get the direct download link from the detail page URL.
141        """
142        raise NotImplementedError()

Abstract class for searching download pages.

file_types = {'all': '', 'video': 'video', 'audio': 'audio', 'archive': 'archive', 'images': 'image'}

search_types = {'relevance': '', 'most_downloaded': '3', 'newest': '4', 'biggest': '1', 'smallest': '2'}

def search( self, prompt, file_type='all', search_type='relevance') -> "Generator['Link_to_file', None, None]": View Source

104    def search(self, prompt, file_type="all", search_type="relevance") -> Generator["Link_to_file", None, None]:
105        """
106        Search for files on the website.
107        """
108        if prompt is None or prompt.strip() == "":
109            raise ValueError("Prompt cannot be empty.")
110        raise NotImplementedError()

Search for files on the website.

@staticmethod

def generate_search_url(prompt, file_type='all', search_type='relevance') -> str: View Source

112    @staticmethod
113    def generate_search_url(prompt, file_type="all", search_type="relevance") -> str:
114        """
115        Generate search URL from input attributes.
116        """
117        raise NotImplementedError()

Generate search URL from input attributes.

@staticmethod

def test_downloaded_file(link_2_file, download_folder) -> bool: View Source

119    @staticmethod
120    def test_downloaded_file(link_2_file, download_folder) -> bool:
121        from src.link_to_file import compare_sizes
122        
123        file_size = os.path.getsize(f"{download_folder}/{link_2_file.title}")
124        if file_size == 0:
125            raise ValueError("ERROR: File is empty.")
126        elif link_2_file.size != None and not compare_sizes(file_size, link_2_file.size, 20/100):
127            raise ValueError("ERROR: File size does not match.")
128        return True

@staticmethod

def parse_catalogue(page) -> "Generator['Link_to_file', None, None]": View Source

130    @staticmethod
131    def parse_catalogue(page) -> Generator["Link_to_file", None, None]:
132        """
133        Parse the catalogue page and yield Link_to_file objects.
134        """
135        raise NotImplementedError()

Parse the catalogue page and yield Link_to_file objects.

@staticmethod

def get_download_link_from_detail(detail_url: str) -> str: View Source

137    @staticmethod
138    def get_download_link_from_detail(detail_url: str) -> str:
139        """
140        Get the direct download link from the detail page URL.
141        """
142        raise NotImplementedError()

Get the direct download link from the detail page URL.