"""prehrajto — downloader module for prehraj.to."""
import os
import re
import logging
import urllib.parse

import bs4
import requests

from basic_colors import *
from src.downloader.page_search import *
from src.link_to_file import Link_to_file


class Prehrajto_downloader(Download_page_search):
    """
    Downloader from: prehraj.to
    """
    webpage = "https://prehraj.to"

    # Class-level logger; the hasHandlers() guard prevents attaching a
    # duplicate FileHandler when this module is imported more than once.
    logger = logging.getLogger("Prehrajto_downloader")
    if not logger.hasHandlers():
        os.makedirs("logs", exist_ok=True)
        handler = logging.FileHandler("logs/prehrajto_downloader.log", encoding="utf-8")
        formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)

    def __init__(self):
        pass

    @staticmethod
    def generate_search_url(prompt, file_type="video", search_type=None):
        """
        Build the search URL for prehraj.to.

        ``file_type`` and ``search_type`` are accepted for interface
        compatibility with the other downloaders but are not used by this
        site's search endpoint.
        """
        # quote() percent-encodes every reserved character; the previous
        # version replaced only spaces with %20, which produced broken URLs
        # for prompts containing '?', '#', '&', '%', etc.
        return f"https://prehraj.to/hledej/{urllib.parse.quote(prompt.strip())}"

    def search(self, prompt, file_type="all", search_type="relevance") -> 'Generator[Link_to_file, None, None]':
        """
        Search for files on prehraj.to.
        Returns a generator of Link_to_file objects.

        Raises:
            ValueError: if the prompt is empty or the search request does
                not return HTTP 200.
        """
        if prompt is None or prompt.strip() == "":
            raise ValueError("Prompt cannot be empty.")
        url = Prehrajto_downloader.generate_search_url(prompt, file_type, search_type)
        Prehrajto_downloader.logger.info(f"Searching Prehrajto with URL: {url}")
        response = requests.get(url, timeout=30)
        # Validate the status before touching the body.
        if response.status_code != 200:
            raise ValueError(f"Failed to retrieve search results, status code: {response.status_code} for URL: {url}")
        return Prehrajto_downloader.parse_catalogue(response.text)

    @staticmethod
    def is_valid_download_page(page) -> bool:
        """
        Return False when the page indicates the video is still being
        processed, e.g. contains:
        <div class="status status--success text-center"> Video se zpracovává </div>
        Accepts either a requests.Response-like object or raw HTML/text.
        """
        # Reject non-200 responses outright.
        if hasattr(page, "status_code") and page.status_code != 200:
            return False

        # Normalize the input to an HTML string.
        if hasattr(page, "text"):
            html = page.text or ""
        else:
            html = page or ""

        if isinstance(html, bytes):
            try:
                html = html.decode("utf-8", errors="ignore")
            except Exception:
                html = str(html)

        soup = bs4.BeautifulSoup(html, "html.parser")

        # Drop script/style content to avoid false text matches.
        for tag in soup(["script", "style"]):
            tag.decompose()

        # Check status-like blocks first.
        for status_div in soup.find_all("div", class_=re.compile(r"\bstatus\b")):
            if "video se zpracov" in status_div.get_text(" ", strip=True).lower():
                return False

        # Fallback: scan the whole page text for processing indicators.
        # The page text is lower-cased below, so the indicators must be
        # lower-case too (the original list contained a capitalized variant
        # that could never match).
        invalid_texts = (
            "video se zpracovává",
            "video se zpracov",
            "zpracováv"
        )

        soup = remove_style(soup)
        page_text = soup.get_text(" ", strip=True).lower()
        if page_text is not None:
            text = remove_empty_lines(page_text)
            if any_text_coresponds_to(text, invalid_texts):
                return False

        # Ensure there is a download anchor present, e.g.
        # <a id="frame" href="...?do=download" class="button cta ...">Stáhnout soubor</a>
        a_frame = soup.find("a", id="frame")
        if not a_frame:
            # Fallback: a button/cta anchor labelled 'stáhnout'.
            a_frame = soup.find("a", class_=re.compile(r"\b(button|cta)\b"), string=re.compile(r"stáhnout", re.I))

        if not a_frame:
            return False

        href = a_frame.get("href", "") or ""
        # "?do=download" is a substring case of "do=download", so a single
        # membership test is sufficient.
        if "do=download" not in href:
            # No download parameter in the href — treat the page as invalid.
            return False

        return True

    @staticmethod
    def get_atributes_from_file_page(soup) -> "Link_to_file":
        """
        Parse file page parameters and return Link_to_file(title, url, size, Prehrajto_downloader).
        Accepts either a BeautifulSoup object, a requests.Response, or raw HTML/text.
        """
        # Accept Response or raw HTML as input — normalize to BeautifulSoup.
        soup = normalize_to_beautifulsoup(soup)

        name_label = find_label_span_by_regex(soup, r'^\s*Název souboru[:\s]*$')
        size_label = find_label_span_by_regex(soup, r'^\s*Velikost[:\s]*$')
        format_label = find_label_span_by_regex(soup, r'^\s*Formát[:\s]*$')

        name = extract_value_from_label(name_label)
        size = extract_value_from_label(size_label)
        fmt = extract_value_from_label(format_label).lower().strip()

        # Normalize filename: append the extension if a format is present
        # and the name does not already end with it.
        if fmt and name and not re.search(r'\.' + re.escape(fmt) + r'\s*$', name, re.I):
            name = f"{name}.{fmt}"

        # Get the download anchor href and strip query/fragment — the detail
        # URL must NOT contain ?do=download (that is appended on demand by
        # get_download_link_from_detail).
        detail_url = ""
        a_frame = soup.find("a", id="frame")
        if a_frame:
            href = a_frame.get("href") or ""
            parsed = urllib.parse.urlparse(href)
            clean = urllib.parse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
            # If relative, prepend the domain.
            if not clean.startswith("http"):
                if clean.startswith("/"):
                    detail_url = f"https://prehraj.to{clean}"
                elif clean:
                    detail_url = f"https://prehraj.to/{clean}"
            else:
                detail_url = clean

        return Link_to_file(name, detail_url, size, Prehrajto_downloader)

    @staticmethod
    def get_atributes_from_catalogue(soup) -> Link_to_file:
        """
        Extract video info from one catalogue HTML element and return a
        Link_to_file.

        Raises:
            ValueError: when the element has no usable video link or title.
        """
        a_tag = soup.find("a", class_="video--link")
        if not a_tag:
            raise ValueError("Unable to find video link in the provided HTML element.")
        title = a_tag.get("title")
        if not title:
            # Fall back to the <h3> title; guard against it being absent so
            # callers get the documented ValueError, not an AttributeError.
            h3 = a_tag.find("h3", class_="video__title")
            if h3 is None:
                raise ValueError("Unable to find video title in the provided HTML element.")
            title = h3.text.strip()
        href = a_tag.get("href")
        detail_url = f"https://prehraj.to{href}" if href and not href.startswith("http") else href

        # Find the file size tag (may be missing).
        size_div = a_tag.find("div", class_="video__tag--size")
        size = size_div.text.strip() if size_div else ""

        return Link_to_file(title, detail_url, size, Prehrajto_downloader)

    @staticmethod
    def parse_catalogue(page) -> 'Generator[Link_to_file, None, None]':
        """
        Walk the search-result pages (new <div>-based layout) and yield
        information about each downloadable file.

        Pagination is followed iteratively (resolves the old TODO about
        removing recursion).

        yield: Link_to_file
        """

        def find_next_url(soup_obj):
            """
            Locate the 'next page' button, e.g.:
            <a href="/hledej/zakl%C3%ADna%C4%8D?vp-page=2" title="Zobrazit další" class="button cta cta--small">Zobrazit další</a>
            Returns an absolute URL or None.
            """
            a = soup_obj.find("a", class_=re.compile(r"\bbutton\b.*\bcta\b.*\bcta--small\b"), string=re.compile(r"Zobrazit další", re.I))
            if not a:
                return None
            href = a.get("href", "").strip()
            if not href:
                return None
            return urllib.parse.urljoin(Prehrajto_downloader.webpage, href)

        def process_soup_and_yield(soup_obj):
            """Yield Link_to_file objects for every result tile on one page."""
            for grid in soup_obj.find_all("div", class_="grid-x"):
                for div in grid.find_all("div", recursive=False):
                    if not div.find("a", class_="video--link"):
                        continue
                    link_2_file = None
                    try:
                        link_2_file = Prehrajto_downloader.get_atributes_from_catalogue(div)
                        detail_page = download_page(link_2_file.detail_url)
                        if not Prehrajto_downloader.is_valid_download_page(detail_page):
                            raise ValueError(f"Status code: {detail_page.status_code}. Invalid download page: no file to download.")
                        link_2_file = Prehrajto_downloader.get_atributes_from_file_page(detail_page)
                        if link_2_file:
                            yield link_2_file
                    except ValueError as e:
                        # Best-effort: report and continue with the next tile.
                        print_error(f"{str(e)} for file: {(link_2_file.title if link_2_file else 'Unknown')}", False)

        # Iterative pagination: each iteration handles one results page.
        while page is not None:
            soup = bs4.BeautifulSoup(page, "html.parser")
            yield from process_soup_and_yield(soup)

            page = None
            next_url = find_next_url(soup)
            if next_url:
                try:
                    resp = requests.get(next_url, timeout=30)
                    if resp.status_code != 200:
                        print_error(f"Failed to retrieve search results, status code: {resp.status_code} for URL: {next_url}", False)
                    else:
                        page = resp.text
                except Exception as e:
                    print_error(f"Failed to fetch next page {next_url}: {e}", False)

    @staticmethod
    def get_download_link_from_detail(detail_url: str) -> str:
        """Return the direct download URL for a detail-page URL."""
        return f"{detail_url}?do=download"
class
Prehrajto_downloader(src.downloader.page_search.Download_page_search):
11class Prehrajto_downloader(Download_page_search): 12 """ 13 Downloader from: prehraj.to 14 """ 15 webpage = "https://prehraj.to" 16 17 logger = logging.getLogger("Prehrajto_downloader") 18 if not logger.hasHandlers(): 19 os.makedirs("logs", exist_ok=True) 20 handler = logging.FileHandler("logs/prehrajto_downloader.log", encoding="utf-8") 21 formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s") 22 handler.setFormatter(formatter) 23 logger.addHandler(handler) 24 logger.setLevel(logging.INFO) 25 26 def __init__(self): 27 pass 28 29 @staticmethod 30 def generate_search_url(prompt, file_type="video", search_type=None): 31 """ 32 Vygeneruje URL pro hledání na prehraj.to. 33 """ 34 prompt = prompt.strip().replace(" ", "%20") 35 return f"https://prehraj.to/hledej/{prompt}" 36 37 def search(self, prompt, file_type="all", search_type="relevance") -> 'Generator[Link_to_file, None, None]': 38 """ 39 Search for files on Datoid.cz. 40 Returns a generator of Link_to_file objects. 41 """ 42 if prompt is None or prompt.strip() == "": 43 raise ValueError("Prompt cannot be empty.") 44 url = Prehrajto_downloader.generate_search_url(prompt, file_type, search_type) 45 Prehrajto_downloader.logger.info(f"Searching Prehrajto with URL: {url}") 46 response = requests.get(url) 47 page = response.text 48 if response.status_code != 200: 49 raise ValueError(f"Failed to retrieve search results, status code: {response.status_code} for URL: {url}") 50 return Prehrajto_downloader.parse_catalogue(page) 51 52 @staticmethod 53 def is_valid_download_page(page) -> bool: 54 """ 55 Returns False when the page indicates the video is still being processed, 56 e.g. contains: <div class="status status--success text-center"> Video se zpracovává </div> 57 Accepts either a requests.Response-like object or raw HTML/text. 
58 """ 59 # Reject non-200 responses 60 if hasattr(page, "status_code") and page.status_code != 200: 61 return False 62 63 # Get HTML/text 64 if hasattr(page, "text"): 65 html = page.text or "" 66 else: 67 html = page or "" 68 69 if isinstance(html, bytes): 70 try: 71 html = html.decode("utf-8", errors="ignore") 72 except Exception: 73 html = str(html) 74 75 soup = bs4.BeautifulSoup(html, "html.parser") 76 77 # remove script/style to avoid false matches 78 for tag in soup(["script", "style"]): 79 tag.decompose() 80 81 # Check status-like blocks first 82 status_divs = soup.find_all("div", class_=re.compile(r"\bstatus\b")) 83 for d in status_divs: 84 if "video se zpracov" in d.get_text(" ", strip=True).lower(): 85 return False 86 87 # Fallback: check whole page text for processing indicators 88 invalid_texts = ( 89 "Video se zpracovává", 90 "video se zpracov", 91 "zpracováv" 92 ) 93 94 soup = remove_style(soup) 95 page_text = soup.get_text(" ", strip=True).lower() 96 if page_text is not None: 97 text = remove_empty_lines(page_text) 98 if any_text_coresponds_to(text, invalid_texts): 99 return False 100 101 # Ensure there is a download anchor present, e.g. 102 # <a id="frame" href="... ?do=download" class="button cta ...">Stáhnout soubor</a> 103 a_frame = soup.find("a", id="frame") 104 if not a_frame: 105 # try fallback: button/cta anchor containing 'stáhnout' or '?do=download' in href 106 a_frame = soup.find("a", class_=re.compile(r"\b(button|cta)\b"), string=re.compile(r"stáhnout", re.I)) 107 108 if not a_frame: 109 return False 110 111 href = a_frame.get("href", "") or "" 112 if "?do=download" not in href and "do=download" not in href: 113 # sometimes href could be absolute or contain params; if no download param, treat as invalid 114 return False 115 116 return True 117 118 119 @staticmethod 120 def get_atributes_from_file_page(soup) -> "Link_to_file": 121 """ 122 Parse file page parameters and return Link_to_file(title, url, size, Prehrajto_downloader). 
123 Accepts either a BeautifulSoup object, a requests.Response, or raw HTML/text. 124 """ 125 # Accept Response or raw HTML as input — normalize to BeautifulSoup 126 soup = normalize_to_beautifulsoup(soup) 127 128 name_label = find_label_span_by_regex(soup, r'^\s*Název souboru[:\s]*$') 129 size_label = find_label_span_by_regex(soup, r'^\s*Velikost[:\s]*$') 130 format_label = find_label_span_by_regex(soup, r'^\s*Formát[:\s]*$') 131 132 name = extract_value_from_label(name_label) 133 size = extract_value_from_label(size_label) 134 fmt = extract_value_from_label(format_label).lower().strip() 135 136 # Normalize filename: append extension if format present and not already there 137 if fmt and name and not re.search(r'\.' + re.escape(fmt) + r'\s*$', name, re.I): 138 name = f"{name}.{fmt}" 139 140 # Get download anchor href and strip query/fragment (detail URL should NOT contain ?do=download) 141 detail_url = "" 142 a_frame = soup.find("a", id="frame") 143 if a_frame: 144 href = a_frame.get("href") or "" 145 # remove query and fragment 146 parsed = urllib.parse.urlparse(href) 147 clean = urllib.parse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", "")) 148 # If relative, prepend domain 149 if not clean.startswith("http"): 150 if clean.startswith("/"): 151 detail_url = f"https://prehraj.to{clean}" 152 elif clean: 153 detail_url = f"https://prehraj.to/{clean}" 154 else: 155 detail_url = clean 156 157 return Link_to_file(name, detail_url, size, Prehrajto_downloader) 158 159 @staticmethod 160 def get_atributes_from_catalogue(soup) -> Link_to_file: 161 """ 162 Získá informace o videu z katalogového HTML elementu (soup) a vrátí Link_to_file. 
163 """ 164 a_tag = soup.find("a", class_="video--link") 165 if not a_tag: 166 raise ValueError("Unable to find video link in the provided HTML element.") 167 title = a_tag.get("title") or a_tag.find("h3", class_="video__title").text.strip() 168 href = a_tag.get("href") 169 detail_url = f"https://prehraj.to{href}" if href and not href.startswith("http") else href 170 171 # Najdi velikost souboru 172 size_div = a_tag.find("div", class_="video__tag--size") 173 size = size_div.text.strip() if size_div else "" 174 175 return Link_to_file(title, detail_url, size, Prehrajto_downloader) 176 177 @staticmethod 178 def parse_catalogue(page) -> 'Generator[Link_to_file, None, None]': 179 """ 180 Prochází stránku s výsledky vyhledávání (nový formát s <div>) a vrací informace o souborech. 181 yield: Link_to_file 182 """ 183 184 def find_next_url(soup_obj): 185 """ 186 <a href="/hledej/zakl%C3%ADna%C4%8D?vp-page=2" title="Zobrazit další" class="button cta cta--small">Zobrazit další</a> 187 """ 188 a = soup_obj.find("a", class_=re.compile(r"\bbutton\b.*\bcta\b.*\bcta--small\b"), string=re.compile(r"Zobrazit další", re.I)) 189 if not a: 190 return None 191 href = a.get("href", "").strip() 192 if not href: 193 return None 194 return urllib.parse.urljoin(Prehrajto_downloader.webpage, href) 195 196 def process_soup_and_yield(soup_obj): 197 grids = soup_obj.find_all("div", class_="grid-x") 198 for grid in grids: 199 for div in grid.find_all("div", recursive=False): 200 a_tag = div.find("a", class_="video--link") 201 link_2_file = None 202 if a_tag: 203 try: 204 link_2_file = Prehrajto_downloader.get_atributes_from_catalogue(div) 205 detail_page = download_page(link_2_file.detail_url) 206 if not Prehrajto_downloader.is_valid_download_page(detail_page): 207 raise ValueError(f"Status code: {detail_page.status_code}. 
Invalid download page: no file to download.") 208 link_2_file = Prehrajto_downloader.get_atributes_from_file_page(detail_page) 209 if link_2_file: 210 yield link_2_file 211 except ValueError as e: 212 print_error(f"{str(e)} for file: {(link_2_file.title if link_2_file else 'Unknown')}", False) 213 214 soup = bs4.BeautifulSoup(page, "html.parser") 215 yield from process_soup_and_yield(soup) 216 217 next_url = find_next_url(soup) 218 if next_url: 219 try: 220 resp = requests.get(next_url) 221 if resp.status_code != 200: 222 print_error(f"Failed to retrieve search results, status code: {resp.status_code} for URL: {next_url}", False) 223 else: 224 #TODO: remove recursion. Use loop instead. 225 yield from Prehrajto_downloader.parse_catalogue(resp.text) 226 except Exception as e: 227 print_error(f"Failed to fetch next page {next_url}: {e}", False) 228 229 @staticmethod 230 def get_download_link_from_detail(detail_url: str) -> str: 231 return f"{detail_url}?do=download"
Downloader from: prehraj.to
@staticmethod
def
generate_search_url(prompt, file_type='video', search_type=None):
29 @staticmethod 30 def generate_search_url(prompt, file_type="video", search_type=None): 31 """ 32 Vygeneruje URL pro hledání na prehraj.to. 33 """ 34 prompt = prompt.strip().replace(" ", "%20") 35 return f"https://prehraj.to/hledej/{prompt}"
Vygeneruje URL pro hledání na prehraj.to.
def
search( self, prompt, file_type='all', search_type='relevance') -> Generator[src.link_to_file.Link_to_file, NoneType, NoneType]:
37 def search(self, prompt, file_type="all", search_type="relevance") -> 'Generator[Link_to_file, None, None]': 38 """ 39 Search for files on Datoid.cz. 40 Returns a generator of Link_to_file objects. 41 """ 42 if prompt is None or prompt.strip() == "": 43 raise ValueError("Prompt cannot be empty.") 44 url = Prehrajto_downloader.generate_search_url(prompt, file_type, search_type) 45 Prehrajto_downloader.logger.info(f"Searching Prehrajto with URL: {url}") 46 response = requests.get(url) 47 page = response.text 48 if response.status_code != 200: 49 raise ValueError(f"Failed to retrieve search results, status code: {response.status_code} for URL: {url}") 50 return Prehrajto_downloader.parse_catalogue(page)
Search for files on Datoid.cz. Returns a generator of Link_to_file objects.
@staticmethod
def
is_valid_download_page(page) -> bool:
52 @staticmethod 53 def is_valid_download_page(page) -> bool: 54 """ 55 Returns False when the page indicates the video is still being processed, 56 e.g. contains: <div class="status status--success text-center"> Video se zpracovává </div> 57 Accepts either a requests.Response-like object or raw HTML/text. 58 """ 59 # Reject non-200 responses 60 if hasattr(page, "status_code") and page.status_code != 200: 61 return False 62 63 # Get HTML/text 64 if hasattr(page, "text"): 65 html = page.text or "" 66 else: 67 html = page or "" 68 69 if isinstance(html, bytes): 70 try: 71 html = html.decode("utf-8", errors="ignore") 72 except Exception: 73 html = str(html) 74 75 soup = bs4.BeautifulSoup(html, "html.parser") 76 77 # remove script/style to avoid false matches 78 for tag in soup(["script", "style"]): 79 tag.decompose() 80 81 # Check status-like blocks first 82 status_divs = soup.find_all("div", class_=re.compile(r"\bstatus\b")) 83 for d in status_divs: 84 if "video se zpracov" in d.get_text(" ", strip=True).lower(): 85 return False 86 87 # Fallback: check whole page text for processing indicators 88 invalid_texts = ( 89 "Video se zpracovává", 90 "video se zpracov", 91 "zpracováv" 92 ) 93 94 soup = remove_style(soup) 95 page_text = soup.get_text(" ", strip=True).lower() 96 if page_text is not None: 97 text = remove_empty_lines(page_text) 98 if any_text_coresponds_to(text, invalid_texts): 99 return False 100 101 # Ensure there is a download anchor present, e.g. 102 # <a id="frame" href="... 
?do=download" class="button cta ...">Stáhnout soubor</a> 103 a_frame = soup.find("a", id="frame") 104 if not a_frame: 105 # try fallback: button/cta anchor containing 'stáhnout' or '?do=download' in href 106 a_frame = soup.find("a", class_=re.compile(r"\b(button|cta)\b"), string=re.compile(r"stáhnout", re.I)) 107 108 if not a_frame: 109 return False 110 111 href = a_frame.get("href", "") or "" 112 if "?do=download" not in href and "do=download" not in href: 113 # sometimes href could be absolute or contain params; if no download param, treat as invalid 114 return False 115 116 return True
Returns False when the page indicates the video is still being processed, e.g. contains:
Video se zpracovává
Accepts either a requests.Response-like object or raw HTML/text.
@staticmethod
def
get_atributes_from_file_page(soup) -> src.link_to_file.Link_to_file:
119 @staticmethod 120 def get_atributes_from_file_page(soup) -> "Link_to_file": 121 """ 122 Parse file page parameters and return Link_to_file(title, url, size, Prehrajto_downloader). 123 Accepts either a BeautifulSoup object, a requests.Response, or raw HTML/text. 124 """ 125 # Accept Response or raw HTML as input — normalize to BeautifulSoup 126 soup = normalize_to_beautifulsoup(soup) 127 128 name_label = find_label_span_by_regex(soup, r'^\s*Název souboru[:\s]*$') 129 size_label = find_label_span_by_regex(soup, r'^\s*Velikost[:\s]*$') 130 format_label = find_label_span_by_regex(soup, r'^\s*Formát[:\s]*$') 131 132 name = extract_value_from_label(name_label) 133 size = extract_value_from_label(size_label) 134 fmt = extract_value_from_label(format_label).lower().strip() 135 136 # Normalize filename: append extension if format present and not already there 137 if fmt and name and not re.search(r'\.' + re.escape(fmt) + r'\s*$', name, re.I): 138 name = f"{name}.{fmt}" 139 140 # Get download anchor href and strip query/fragment (detail URL should NOT contain ?do=download) 141 detail_url = "" 142 a_frame = soup.find("a", id="frame") 143 if a_frame: 144 href = a_frame.get("href") or "" 145 # remove query and fragment 146 parsed = urllib.parse.urlparse(href) 147 clean = urllib.parse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", "")) 148 # If relative, prepend domain 149 if not clean.startswith("http"): 150 if clean.startswith("/"): 151 detail_url = f"https://prehraj.to{clean}" 152 elif clean: 153 detail_url = f"https://prehraj.to/{clean}" 154 else: 155 detail_url = clean 156 157 return Link_to_file(name, detail_url, size, Prehrajto_downloader)
Parse file page parameters and return Link_to_file(title, url, size, Prehrajto_downloader). Accepts either a BeautifulSoup object, a requests.Response, or raw HTML/text.
@staticmethod
def
get_atributes_from_catalogue(soup) -> src.link_to_file.Link_to_file:
159 @staticmethod 160 def get_atributes_from_catalogue(soup) -> Link_to_file: 161 """ 162 Získá informace o videu z katalogového HTML elementu (soup) a vrátí Link_to_file. 163 """ 164 a_tag = soup.find("a", class_="video--link") 165 if not a_tag: 166 raise ValueError("Unable to find video link in the provided HTML element.") 167 title = a_tag.get("title") or a_tag.find("h3", class_="video__title").text.strip() 168 href = a_tag.get("href") 169 detail_url = f"https://prehraj.to{href}" if href and not href.startswith("http") else href 170 171 # Najdi velikost souboru 172 size_div = a_tag.find("div", class_="video__tag--size") 173 size = size_div.text.strip() if size_div else "" 174 175 return Link_to_file(title, detail_url, size, Prehrajto_downloader)
Získá informace o videu z katalogového HTML elementu (soup) a vrátí Link_to_file.
@staticmethod
def
parse_catalogue(page) -> Generator[src.link_to_file.Link_to_file, NoneType, NoneType]:
177 @staticmethod 178 def parse_catalogue(page) -> 'Generator[Link_to_file, None, None]': 179 """ 180 Prochází stránku s výsledky vyhledávání (nový formát s <div>) a vrací informace o souborech. 181 yield: Link_to_file 182 """ 183 184 def find_next_url(soup_obj): 185 """ 186 <a href="/hledej/zakl%C3%ADna%C4%8D?vp-page=2" title="Zobrazit další" class="button cta cta--small">Zobrazit další</a> 187 """ 188 a = soup_obj.find("a", class_=re.compile(r"\bbutton\b.*\bcta\b.*\bcta--small\b"), string=re.compile(r"Zobrazit další", re.I)) 189 if not a: 190 return None 191 href = a.get("href", "").strip() 192 if not href: 193 return None 194 return urllib.parse.urljoin(Prehrajto_downloader.webpage, href) 195 196 def process_soup_and_yield(soup_obj): 197 grids = soup_obj.find_all("div", class_="grid-x") 198 for grid in grids: 199 for div in grid.find_all("div", recursive=False): 200 a_tag = div.find("a", class_="video--link") 201 link_2_file = None 202 if a_tag: 203 try: 204 link_2_file = Prehrajto_downloader.get_atributes_from_catalogue(div) 205 detail_page = download_page(link_2_file.detail_url) 206 if not Prehrajto_downloader.is_valid_download_page(detail_page): 207 raise ValueError(f"Status code: {detail_page.status_code}. Invalid download page: no file to download.") 208 link_2_file = Prehrajto_downloader.get_atributes_from_file_page(detail_page) 209 if link_2_file: 210 yield link_2_file 211 except ValueError as e: 212 print_error(f"{str(e)} for file: {(link_2_file.title if link_2_file else 'Unknown')}", False) 213 214 soup = bs4.BeautifulSoup(page, "html.parser") 215 yield from process_soup_and_yield(soup) 216 217 next_url = find_next_url(soup) 218 if next_url: 219 try: 220 resp = requests.get(next_url) 221 if resp.status_code != 200: 222 print_error(f"Failed to retrieve search results, status code: {resp.status_code} for URL: {next_url}", False) 223 else: 224 #TODO: remove recursion. Use loop instead. 
225 yield from Prehrajto_downloader.parse_catalogue(resp.text) 226 except Exception as e: 227 print_error(f"Failed to fetch next page {next_url}: {e}", False)
Prochází stránku s výsledky vyhledávání (nový formát s
) a vrací informace o souborech.
yield: Link_to_file