page_search

  1from __future__ import annotations
  2import os
  3import re
  4import bs4
  5from src.download import *
  6from typing import Any, Generator
  7
  8def remove_style(soup: bs4.BeautifulSoup | bs4.Tag) -> bs4.BeautifulSoup | bs4.Tag:
  9    """
 10    Removes everything between <style>...</style> tags from the content.
 11    """
 12    for style in soup("style"):
 13        style.decompose()
 14    return soup
 15
 16def remove_empty_lines(text: Any) -> str:
 17    """
 18    Removes empty lines from text.
 19    """
 20    if isinstance(text, str):
 21        return "\n".join([line for line in text.split("\n") if line.strip() != ""])
 22    return ""
 23
 24def normalize_to_beautifulsoup(soup: bs4.BeautifulSoup | bs4.Tag | requests.Response | str | bytes) -> bs4.BeautifulSoup:
 25    """
 26    Normalize input to a BeautifulSoup object.
 27    """
 28    if not isinstance(soup, bs4.BeautifulSoup):
 29        if hasattr(soup, "text"):
 30            html = soup.text or ""
 31        else:
 32            html = soup or ""
 33        if isinstance(html, bytes):
 34            try:
 35                html = html.decode("utf-8", errors="ignore")
 36            except Exception:
 37                html = str(html)
 38        soup = bs4.BeautifulSoup(html, "html.parser")
 39    return soup
 40
 41def find_label_span_by_regex(soup, regex) -> bs4.Tag | None:
 42    """
 43    Finds a <span> whose text matches the given regex.
 44    """
 45    return soup.find("span", string=re.compile(regex, re.I))
 46
 47def extract_value_from_label(label_span):
 48    """
 49    Extracts the value associated with a label span.
 50    It first looks for the next sibling <span>, and if not found, it checks the same <li> for a second <span>.
 51    """
 52    if not label_span:
 53        return ""
 54    # preferred: the following sibling <span>
 55    val_span = label_span.find_next_sibling("span")
 56    if val_span and val_span.get_text(strip=True):
 57        return val_span.get_text(strip=True)
 58    # fallback: same <li> second span
 59    li = label_span.find_parent("li")
 60    if li:
 61        spans = li.find_all("span")
 62        if len(spans) >= 2 and spans[1].get_text(strip=True):
 63            return spans[1].get_text(strip=True)
 64    return ""
 65
 66def any_text_coresponds_to(text, texts) -> bool:
 67    """
 68    Check if any text corresponds to the given texts.
 69    """
 70    return any([t in text for t in texts])
 71
 72class InsufficientTimeoutError(Exception):
 73    """
 74    Exception raised for insufficient timeout.
 75    """
 76    def __init__(self, message="Timeout is too short."):
 77        self.message = message
 78        super().__init__(self.message)
 79
 80class Download_page_search:
 81    """
 82    Abstract class for searching download pages.
 83    """
 84    
 85    file_types = {
 86        "all": "",
 87        "video": "video",
 88        "audio": "audio",
 89        "archive": "archive",
 90        "images": "image"
 91    }
 92    search_types = {
 93        "relevance": "",
 94        "most_downloaded": "3",
 95        "newest": "4",
 96        "biggest": "1",
 97        "smallest": "2"
 98    }
 99        
100    def __init__(self):
101        raise NotImplementedError()
102    
103    def search(self, prompt, file_type="all", search_type="relevance") -> Generator["Link_to_file", None, None]:
104        """
105        Search for files on the website.
106        """
107        if prompt is None or prompt.strip() == "":
108            raise ValueError("Prompt cannot be empty.")
109        raise NotImplementedError()
110    
111    @staticmethod
112    def generate_search_url(prompt, file_type="all", search_type="relevance") -> str:
113        """
114        Generate search URL from input attributes.
115        """
116        raise NotImplementedError()
117    
118    @staticmethod
119    def test_downloaded_file(link_2_file, download_folder) -> bool:
120        from src.link_to_file import compare_sizes
121        
122        file_size = os.path.getsize(f"{download_folder}/{link_2_file.title}")
123        if file_size == 0:
124            raise ValueError("ERROR: File is empty.")
125        elif link_2_file.size != None and not compare_sizes(file_size, link_2_file.size, 20/100):
126            raise ValueError("ERROR: File size does not match.")
127        return True
128    
129    @staticmethod
130    def parse_catalogue(page) -> Generator["Link_to_file", None, None]:
131        """
132        Parse the catalogue page and yield Link_to_file objects.
133        """
134        raise NotImplementedError()
135    
136    @staticmethod
137    def get_download_link_from_detail(detail_url: str) -> str:
138        """
139        Get the direct download link from the detail page URL.
140        """
141        raise NotImplementedError()
142        
def remove_style( soup: bs4.BeautifulSoup | bs4.element.Tag) -> bs4.BeautifulSoup | bs4.element.Tag:
 9def remove_style(soup: bs4.BeautifulSoup | bs4.Tag) -> bs4.BeautifulSoup | bs4.Tag:
10    """
11    Removes everything between <style>...</style> tags from the content.
12    """
13    for style in soup("style"):
14        style.decompose()
15    return soup

Removes everything between `<style>...</style>` tags from the content.

def remove_empty_lines(text: Any) -> str:
17def remove_empty_lines(text: Any) -> str:
18    """
19    Removes empty lines from text.
20    """
21    if isinstance(text, str):
22        return "\n".join([line for line in text.split("\n") if line.strip() != ""])
23    return ""

Removes empty lines from text.

def normalize_to_beautifulsoup( soup: bs4.BeautifulSoup | bs4.element.Tag | requests.models.Response | str | bytes) -> bs4.BeautifulSoup:
25def normalize_to_beautifulsoup(soup: bs4.BeautifulSoup | bs4.Tag | requests.Response | str | bytes) -> bs4.BeautifulSoup:
26    """
27    Normalize input to a BeautifulSoup object.
28    """
29    if not isinstance(soup, bs4.BeautifulSoup):
30        if hasattr(soup, "text"):
31            html = soup.text or ""
32        else:
33            html = soup or ""
34        if isinstance(html, bytes):
35            try:
36                html = html.decode("utf-8", errors="ignore")
37            except Exception:
38                html = str(html)
39        soup = bs4.BeautifulSoup(html, "html.parser")
40    return soup

Normalize input to a BeautifulSoup object.

def find_label_span_by_regex(soup, regex) -> bs4.element.Tag | None:
42def find_label_span_by_regex(soup, regex) -> bs4.Tag | None:
43    """
44    Finds a <span> whose text matches the given regex.
45    """
46    return soup.find("span", string=re.compile(regex, re.I))

Finds a `<span>` whose text matches the given regex.

def extract_value_from_label(label_span):
48def extract_value_from_label(label_span):
49    """
50    Extracts the value associated with a label span.
51    It first looks for the next sibling <span>, and if not found, it checks the same <li> for a second <span>.
52    """
53    if not label_span:
54        return ""
55    # preferred: the following sibling <span>
56    val_span = label_span.find_next_sibling("span")
57    if val_span and val_span.get_text(strip=True):
58        return val_span.get_text(strip=True)
59    # fallback: same <li> second span
60    li = label_span.find_parent("li")
61    if li:
62        spans = li.find_all("span")
63        if len(spans) >= 2 and spans[1].get_text(strip=True):
64            return spans[1].get_text(strip=True)
65    return ""

Extracts the value associated with a label span. It first looks for the next sibling `<span>`, and if not found, it checks the same `<li>` for a second `<span>`.

  • def any_text_coresponds_to(text, texts) -> bool:
    67def any_text_coresponds_to(text, texts) -> bool:
    68    """
    69    Check if any text corresponds to the given texts.
    70    """
    71    return any([t in text for t in texts])
    

    Check if any text corresponds to the given texts.

    class InsufficientTimeoutError(builtins.Exception):
    73class InsufficientTimeoutError(Exception):
    74    """
    75    Exception raised for insufficient timeout.
    76    """
    77    def __init__(self, message="Timeout is too short."):
    78        self.message = message
    79        super().__init__(self.message)
    

    Exception raised for insufficient timeout.

    InsufficientTimeoutError(message='Timeout is too short.')
    77    def __init__(self, message="Timeout is too short."):
    78        self.message = message
    79        super().__init__(self.message)
    
    message