# sdilej.cz downloader module
1from __future__ import annotations 2import logging 3import bs4 4from src.download import * 5from basic_colors import * 6from src.downloader.page_search import * 7from src.link_to_file import Link_to_file, compare_sizes 8 9class Sdilej_downloader(Download_page_search): 10 """ 11 Downloader from: sdilej.cz 12 """ 13 webpage = "https://sdilej.cz" 14 15 logger = logging.getLogger("Sdilej_downloader") 16 if not logger.hasHandlers(): 17 os.makedirs("logs", exist_ok=True) 18 handler = logging.FileHandler("logs/sdilej_downloader.log", encoding="utf-8") 19 formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s") 20 handler.setFormatter(formatter) 21 logger.addHandler(handler) 22 logger.setLevel(logging.INFO) 23 24 def __init__(self): 25 pass 26 27 def search(self, prompt, file_type="all", search_type="relevance") -> 'Generator[Link_to_file, None, None]': 28 if prompt is None or prompt.strip() == "": 29 raise ValueError("Prompt cannot be empty.") 30 url = Sdilej_downloader.generate_search_url(prompt, file_type, search_type) 31 Sdilej_downloader.logger.info(f"Searching Sdilej with URL: {url}") 32 response = requests.get(url) 33 Sdilej_downloader.logger.info(f"Response received: {response.status_code}") 34 return Sdilej_downloader.parse_catalogue(response) 35 36 @staticmethod 37 def generate_search_url(prompt, file_type="all", search_type="relevance"): 38 """ 39 generate url from input 40 """ 41 return f"{Sdilej_downloader.webpage}/{prompt}/s/{Sdilej_downloader.file_types[file_type]}-{Sdilej_downloader.search_types[search_type]}" 42 43 @staticmethod 44 def get_atributes_from_catalogue(soup) -> Link_to_file: 45 try: 46 link = soup.find("a").get("href") 47 title = soup.find("a").get("title") 48 size = soup.find_all("p")[1].text 49 link_2_file = Link_to_file(title, link, size, Sdilej_downloader) 50 except Exception as e: 51 raise ValueError("ERROR: unable to parse atributes." 
+ str(e)) 52 return link_2_file 53 54 @staticmethod 55 def get_atributes_from_file_page(soup) -> Link_to_file: 56 try: 57 title = soup.find("h1").text 58 size = soup.find("b").next_sibling.replace("|", "").strip() 59 link = Sdilej_downloader.webpage+str(soup.find("a", class_="btn btn-danger").get("href")) 60 link_2_file = Link_to_file(title, link, size, Sdilej_downloader) 61 except Exception as e: 62 raise ValueError("Download button not found on detail page." + str(e)) 63 return link_2_file 64 65 @staticmethod 66 def get_download_link_from_detail(detail_url: str) -> str: 67 """ 68 Získá přímý odkaz ke stažení ze stránky s detailem souboru na sdilej.cz. 69 """ 70 page = download_page(detail_url) 71 soup = bs4.BeautifulSoup(page.text, "html.parser") 72 # Najdi tlačítko pro stažení 73 download_btn = soup.find("a", class_="btn btn-danger") 74 if not download_btn: 75 raise ValueError("Download button not found on detail page for: {}".format(detail_url)) 76 download_link = Sdilej_downloader.webpage + str(download_btn.get("href")) 77 return download_link 78 79 @staticmethod 80 def is_valid_download_page(page) -> bool: 81 """ 82 Stránka neplatná, pokud obsahuje: 83 <h1 class="red">Stahuj a nahrávej soubory neomezenou rychlostí</h1> 84 "Tento soubor byl smazán." 85 """ 86 soup = bs4.BeautifulSoup(page.text, "html.parser") 87 invalid_texts = ( 88 "Stahuj a nahrávej soubory neomezenou rychlostí", 89 "Chyba 404 Nenalezeno", 90 "Tento soubor byl smazán." 
91 ) 92 page_title = soup.find("h1", class_="red") 93 if page_title is not None and page_title.text in invalid_texts: 94 return False 95 96 soup = remove_style(soup) 97 page_txt = soup.find("div", class_="content") 98 if page_txt is not None: 99 text = remove_empty_lines(page_txt.text) 100 if any_text_coresponds_to(text, invalid_texts): 101 return False 102 return True 103 104 @staticmethod 105 def test_downloaded_file(link_2_file, download_folder) -> bool: 106 file_size = os.path.getsize(f"{download_folder}/{link_2_file.title}") 107 if file_size == 0: 108 raise ValueError("ERROR: File is empty.") 109 elif link_2_file.size != None and file_size < 1024: 110 file = os.path.join(download_folder, link_2_file.title) 111 data = open(file, "r", encoding='utf-8').read() 112 return Sdilej_downloader.test_downloaded_data(data) 113 elif link_2_file.size != None and not compare_sizes(file_size, link_2_file.size, 20/100): 114 raise ValueError("ERROR: File size does not match.") 115 return True 116 117 @staticmethod 118 def test_downloaded_data(data) -> bool: 119 """ 120 Tests the downloaded data. 121 The data is invalid if a sufficient timeout has occurred. 122 If the page contains: 123 "<script>top.location.href='https://sdilej.cz/free-stahovani';</script>" 124 "<h1 class=\"red\">Stahování více souborů najednou</h1>" 125 """ 126 if data is None: 127 raise ValueError("ERROR: No data downloaded.") 128 if "<script>top.location.href='https://sdilej.cz/free-stahovani';</script>" in data: 129 raise InsufficientTimeoutError() 130 if "<h1 class=\"red\">Stahování více souborů najednou</h1>" in data: 131 raise InsufficientTimeoutError() 132 return True 133 134 @staticmethod 135 def parse_file_page(page): 136 if not Sdilej_downloader.is_valid_download_page(page): 137 raise ValueError("Status code: " + str(page.status_code) + ". 
Invalid download page: no file to download.") 138 soup = bs4.BeautifulSoup(page.text, "html.parser") 139 content = soup.find("div", class_="content") 140 content = soup.find("div", class_="col-md-12 col-sm-12 detail-leftcol") 141 return content 142 143 @staticmethod 144 def parse_catalogue(page) -> 'Generator[Link_to_file, None, None]': 145 """ 146 Iterates through the search results page and returns information about the files. 147 148 Yields: Link_to_file 149 """ 150 soup = bs4.BeautifulSoup(page.text, "html.parser") 151 content = soup.find("div", class_="row post") 152 if content is None: 153 return None 154 content = remove_style(content) 155 for videobox in content.find_all(class_="videobox-desc"): 156 catalogue_file = None 157 try: 158 catalogue_file = Sdilej_downloader.get_atributes_from_catalogue(videobox) 159 download_page_content = Sdilej_downloader.parse_file_page(download_page(catalogue_file.detail_url)) 160 link_2_file = Sdilej_downloader.get_atributes_from_file_page(download_page_content) 161 link_2_file.detail_url = catalogue_file.detail_url # zachovej původní detail_url! 162 yield link_2_file 163 except ValueError as e: 164 print_error(str(e) + " for file: " + (catalogue_file.title if catalogue_file else "Unknown"), False)
class Sdilej_downloader(Download_page_search):
    """Search/download client for the sdilej.cz file-sharing site."""

    webpage = "https://sdilej.cz"

    # Wire the class logger to a file exactly once per process.
    logger = logging.getLogger("Sdilej_downloader")
    if not logger.hasHandlers():
        os.makedirs("logs", exist_ok=True)
        handler = logging.FileHandler("logs/sdilej_downloader.log", encoding="utf-8")
        formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)

    def __init__(self):
        pass

    def search(self, prompt, file_type="all", search_type="relevance") -> 'Generator[Link_to_file, None, None]':
        """Query the site and yield a Link_to_file per result."""
        if prompt is None or not prompt.strip():
            raise ValueError("Prompt cannot be empty.")
        target = Sdilej_downloader.generate_search_url(prompt, file_type, search_type)
        Sdilej_downloader.logger.info(f"Searching Sdilej with URL: {target}")
        reply = requests.get(target)
        Sdilej_downloader.logger.info(f"Response received: {reply.status_code}")
        return Sdilej_downloader.parse_catalogue(reply)

    @staticmethod
    def generate_search_url(prompt, file_type="all", search_type="relevance"):
        """Compose the catalogue-search URL from the inputs."""
        base = Sdilej_downloader.webpage
        kind = Sdilej_downloader.file_types[file_type]
        order = Sdilej_downloader.search_types[search_type]
        return f"{base}/{prompt}/s/{kind}-{order}"

    @staticmethod
    def get_atributes_from_catalogue(soup) -> Link_to_file:
        """Pull href/title/size out of one catalogue box."""
        try:
            anchor = soup.find("a")
            size_text = soup.find_all("p")[1].text
            result = Link_to_file(anchor.get("title"), anchor.get("href"), size_text, Sdilej_downloader)
        except Exception as e:
            raise ValueError("ERROR: unable to parse atributes." + str(e))
        return result

    @staticmethod
    def get_atributes_from_file_page(soup) -> Link_to_file:
        """Pull title/size/direct link out of a file-detail page."""
        try:
            heading = soup.find("h1").text
            size_text = soup.find("b").next_sibling.replace("|", "").strip()
            button = soup.find("a", class_="btn btn-danger")
            result = Link_to_file(heading,
                                  Sdilej_downloader.webpage + str(button.get("href")),
                                  size_text,
                                  Sdilej_downloader)
        except Exception as e:
            raise ValueError("Download button not found on detail page." + str(e))
        return result

    @staticmethod
    def get_download_link_from_detail(detail_url: str) -> str:
        """Fetch the detail page and return its direct download URL."""
        markup = bs4.BeautifulSoup(download_page(detail_url).text, "html.parser")
        button = markup.find("a", class_="btn btn-danger")
        if not button:
            raise ValueError("Download button not found on detail page for: {}".format(detail_url))
        return Sdilej_downloader.webpage + str(button.get("href"))

    @staticmethod
    def is_valid_download_page(page) -> bool:
        """
        Heuristic validity check: reject pages carrying one of the known
        error texts (promo headline, 404, deleted-file notice).
        """
        soup = bs4.BeautifulSoup(page.text, "html.parser")
        invalid_texts = (
            "Stahuj a nahrávej soubory neomezenou rychlostí",
            "Chyba 404 Nenalezeno",
            "Tento soubor byl smazán."
        )
        headline = soup.find("h1", class_="red")
        if headline is not None and headline.text in invalid_texts:
            return False
        body = remove_style(soup).find("div", class_="content")
        if body is None:
            return True
        return not any_text_coresponds_to(remove_empty_lines(body.text), invalid_texts)

    @staticmethod
    def test_downloaded_file(link_2_file, download_folder) -> bool:
        """Check the downloaded file's size; inspect tiny files' content."""
        file_size = os.path.getsize(f"{download_folder}/{link_2_file.title}")
        if file_size == 0:
            raise ValueError("ERROR: File is empty.")
        if link_2_file.size != None and file_size < 1024:
            data = open(os.path.join(download_folder, link_2_file.title), "r", encoding='utf-8').read()
            return Sdilej_downloader.test_downloaded_data(data)
        if link_2_file.size != None and not compare_sizes(file_size, link_2_file.size, 20/100):
            raise ValueError("ERROR: File size does not match.")
        return True

    @staticmethod
    def test_downloaded_data(data) -> bool:
        """Raise if *data* carries one of the site's rate-limit markers."""
        if data is None:
            raise ValueError("ERROR: No data downloaded.")
        markers = (
            "<script>top.location.href='https://sdilej.cz/free-stahovani';</script>",
            "<h1 class=\"red\">Stahování více souborů najednou</h1>",
        )
        for marker in markers:
            if marker in data:
                raise InsufficientTimeoutError()
        return True

    @staticmethod
    def parse_file_page(page):
        """Return the detail column <div> of a validated file page."""
        if not Sdilej_downloader.is_valid_download_page(page):
            raise ValueError("Status code: " + str(page.status_code) + ". Invalid download page: no file to download.")
        soup = bs4.BeautifulSoup(page.text, "html.parser")
        content = soup.find("div", class_="content")  # NOTE(review): overwritten below; kept as in the original
        content = soup.find("div", class_="col-md-12 col-sm-12 detail-leftcol")
        return content

    @staticmethod
    def parse_catalogue(page) -> 'Generator[Link_to_file, None, None]':
        """Walk the result page and yield a Link_to_file per entry."""
        soup = bs4.BeautifulSoup(page.text, "html.parser")
        listing = soup.find("div", class_="row post")
        if listing is None:
            return None
        listing = remove_style(listing)
        for box in listing.find_all(class_="videobox-desc"):
            entry = None
            try:
                entry = Sdilej_downloader.get_atributes_from_catalogue(box)
                detail = Sdilej_downloader.parse_file_page(download_page(entry.detail_url))
                found = Sdilej_downloader.get_atributes_from_file_page(detail)
                found.detail_url = entry.detail_url  # keep the original detail_url!
                yield found
            except ValueError as e:
                print_error(str(e) + " for file: " + (entry.title if entry else "Unknown"), False)
class Sdilej_downloader(Download_page_search):
    """
    Downloader for the file-sharing service at sdilej.cz.
    """

    webpage = "https://sdilej.cz"

    logger = logging.getLogger("Sdilej_downloader")
    if not logger.hasHandlers():
        # Create the log directory and hook up a single file handler.
        os.makedirs("logs", exist_ok=True)
        handler = logging.FileHandler("logs/sdilej_downloader.log", encoding="utf-8")
        formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s")
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)

    def __init__(self):
        pass

    def search(self, prompt, file_type="all", search_type="relevance") -> 'Generator[Link_to_file, None, None]':
        """Search the site for *prompt*; yields Link_to_file objects."""
        if prompt is None or prompt.strip() == "":
            raise ValueError("Prompt cannot be empty.")
        search_url = Sdilej_downloader.generate_search_url(prompt, file_type, search_type)
        Sdilej_downloader.logger.info(f"Searching Sdilej with URL: {search_url}")
        page = requests.get(search_url)
        Sdilej_downloader.logger.info(f"Response received: {page.status_code}")
        return Sdilej_downloader.parse_catalogue(page)

    @staticmethod
    def generate_search_url(prompt, file_type="all", search_type="relevance"):
        """Return the site URL encoding the prompt and both filter keys."""
        return "{}/{}/s/{}-{}".format(
            Sdilej_downloader.webpage,
            prompt,
            Sdilej_downloader.file_types[file_type],
            Sdilej_downloader.search_types[search_type],
        )

    @staticmethod
    def get_atributes_from_catalogue(soup) -> Link_to_file:
        """Build a Link_to_file from one catalogue result box."""
        try:
            href = soup.find("a").get("href")
            name = soup.find("a").get("title")
            size_label = soup.find_all("p")[1].text
            item = Link_to_file(name, href, size_label, Sdilej_downloader)
        except Exception as e:
            raise ValueError("ERROR: unable to parse atributes." + str(e))
        return item

    @staticmethod
    def get_atributes_from_file_page(soup) -> Link_to_file:
        """Build a Link_to_file from a file-detail page's markup."""
        try:
            name = soup.find("h1").text
            size_label = soup.find("b").next_sibling.replace("|", "").strip()
            direct = Sdilej_downloader.webpage + str(soup.find("a", class_="btn btn-danger").get("href"))
            item = Link_to_file(name, direct, size_label, Sdilej_downloader)
        except Exception as e:
            raise ValueError("Download button not found on detail page." + str(e))
        return item

    @staticmethod
    def get_download_link_from_detail(detail_url: str) -> str:
        """Return the direct download URL found on a file's detail page."""
        detail_page = download_page(detail_url)
        detail_soup = bs4.BeautifulSoup(detail_page.text, "html.parser")
        btn = detail_soup.find("a", class_="btn btn-danger")
        if not btn:
            raise ValueError("Download button not found on detail page for: {}".format(detail_url))
        return Sdilej_downloader.webpage + str(btn.get("href"))

    @staticmethod
    def is_valid_download_page(page) -> bool:
        """
        A page is invalid when it shows one of the site's error messages
        (promo headline, 404, deleted-file notice).
        """
        invalid_texts = (
            "Stahuj a nahrávej soubory neomezenou rychlostí",
            "Chyba 404 Nenalezeno",
            "Tento soubor byl smazán."
        )
        soup = bs4.BeautifulSoup(page.text, "html.parser")
        red_heading = soup.find("h1", class_="red")
        if red_heading is not None:
            if red_heading.text in invalid_texts:
                return False
        stripped = remove_style(soup)
        container = stripped.find("div", class_="content")
        if container is not None:
            cleaned = remove_empty_lines(container.text)
            if any_text_coresponds_to(cleaned, invalid_texts):
                return False
        return True

    @staticmethod
    def test_downloaded_file(link_2_file, download_folder) -> bool:
        """Sanity-check a finished download by size and, if tiny, by content."""
        actual = os.path.getsize(f"{download_folder}/{link_2_file.title}")
        if actual == 0:
            raise ValueError("ERROR: File is empty.")
        has_expected = link_2_file.size != None
        if has_expected and actual < 1024:
            target = os.path.join(download_folder, link_2_file.title)
            return Sdilej_downloader.test_downloaded_data(open(target, "r", encoding='utf-8').read())
        if has_expected and not compare_sizes(actual, link_2_file.size, 20/100):
            raise ValueError("ERROR: File size does not match.")
        return True

    @staticmethod
    def test_downloaded_data(data) -> bool:
        """Raise InsufficientTimeoutError on the site's throttle markers."""
        if data is None:
            raise ValueError("ERROR: No data downloaded.")
        throttled = (
            "<script>top.location.href='https://sdilej.cz/free-stahovani';</script>" in data
            or "<h1 class=\"red\">Stahování více souborů najednou</h1>" in data
        )
        if throttled:
            raise InsufficientTimeoutError()
        return True

    @staticmethod
    def parse_file_page(page):
        """Validate the page, then return its left detail column <div>."""
        if not Sdilej_downloader.is_valid_download_page(page):
            raise ValueError("Status code: " + str(page.status_code) + ". Invalid download page: no file to download.")
        parsed = bs4.BeautifulSoup(page.text, "html.parser")
        found = parsed.find("div", class_="content")  # NOTE(review): overwritten below; kept as in the original
        found = parsed.find("div", class_="col-md-12 col-sm-12 detail-leftcol")
        return found

    @staticmethod
    def parse_catalogue(page) -> 'Generator[Link_to_file, None, None]':
        """
        Iterate the search-result listing, fetch each file's detail page
        and yield a fully populated Link_to_file.
        """
        document = bs4.BeautifulSoup(page.text, "html.parser")
        results = document.find("div", class_="row post")
        if results is None:
            return None
        results = remove_style(results)
        for description in results.find_all(class_="videobox-desc"):
            summary = None
            try:
                summary = Sdilej_downloader.get_atributes_from_catalogue(description)
                detail_html = Sdilej_downloader.parse_file_page(download_page(summary.detail_url))
                full = Sdilej_downloader.get_atributes_from_file_page(detail_html)
                full.detail_url = summary.detail_url  # keep the original detail_url!
                yield full
            except ValueError as e:
                print_error(str(e) + " for file: " + (summary.title if summary else "Unknown"), False)