# Provenance (taken from the git patch this file was extracted from):
#   From a4f40f091aa825d3a7f996c00a5404d9a3690ef3 Mon Sep 17 00:00:00 2001
#   From: hygienic-books
#   Date: Mon, 28 Feb 2022 04:21:18 +0100
#   Subject: [PATCH] Initial commit of heiseselect-dl script
#   main.py | 79 insertions(+), new file (index 0000000..ba15b89)

import sys
import time
import requests
import logging
from rich.logging import RichHandler


# The handler renders the record itself, so the format is just the bare message.
FORMAT = "%(message)s"

# Root logger: every module defaults to WARNING.
logging.basicConfig(
    level="WARNING",
    format=FORMAT,
    datefmt="[%X]",
    handlers=[RichHandler()],
)

# This script's own logger gets a separate, chattier level; this is the knob
# that is most likely to be adjusted.
log = logging.getLogger("rich")
log.setLevel(logging.INFO)


# To tune the log level of one specific module, first find out which modules
# use the 'logging' module themselves by listing the registered logger names.
# See also https://stackoverflow.com/a/36208664.
# Debug aid: uncomment to print the name of every logger registered so far.
# for key in logging.Logger.manager.loggerDict:
#     print(key)


# Imported here because only this section needs it (environment lookups and
# path joining).
import os

# Credentials and the target directory are read from the environment so that
# no secret has to live in the source file.
username = os.environ.get("HEISE_USERNAME", "")
password = os.environ.get("HEISE_PASSWORD", "")
dl_dir = os.environ.get("HEISE_DL_DIR", os.curdir)
file_name_on_disk = r"""2021-5.pdf"""
dl_url = r"""https://www.heise.de/select/ct/archiv/2021/5/download"""
dl_wait_timeout = 60

# A response is only treated as the finished magazine once it is larger than
# this many bytes; smaller responses mean the file is not ready yet.
_MIN_DOWNLOAD_BYTES = 1000000
# Per-request timeout in seconds, so a hung connection cannot block forever.
_HTTP_TIMEOUT = 30
# Seconds to sleep between two polls of the download URL.
_POLL_INTERVAL = 3
# Chunk size used when streaming the response body to disk.
_CHUNK_BYTES = 64 * 1024


def logout(logged_in_session: "requests.Session") -> None:
    """Call the heise.de SSO logout endpoint with the given session.

    Args:
        logged_in_session: Session whose cookies identify the login to end.
    """
    log.debug("Logging out ...")
    with logged_in_session.get(
        "https://www.heise.de/sso/login/logout", timeout=_HTTP_TIMEOUT
    ) as logout_request:
        log.debug(f"Logout got HTTP status code {logout_request.status_code}")


def login(login_username: str, login_password: str) -> "requests.Session":
    """Post the credentials to the heise.de SSO login endpoint.

    Args:
        login_username: Account name sent as the ``username`` form field.
        login_password: Password sent as the ``password`` form field.

    Returns:
        The session used for the login request. It is returned even when the
        response carried no ``Set-Cookie`` header; that case is logged as a
        warning so the caller can see why later requests fail.
    """
    log.debug("Logging in ...")
    session = requests.Session()
    payload = {
        "username": login_username,
        "password": login_password,
    }
    # Redirects are not followed: the Set-Cookie header is checked on the
    # direct response to the login POST.
    response = session.post(
        "https://www.heise.de/sso/login/login",
        data=payload,
        allow_redirects=False,
        timeout=_HTTP_TIMEOUT,
    )
    if "Set-Cookie" in response.headers:
        log.debug("Logged in, cookie received")
    else:
        log.warning(
            f"Login returned HTTP status code {response.status_code} without a cookie"
        )
    return session


def download_mag(**kwargs: "requests.Session") -> None:
    """Poll the download URL until the file is ready, then save it to disk.

    The URL is requested every few seconds. As soon as a response announces a
    ``Content-Length`` above the readiness threshold, that same response is
    streamed to ``dl_dir``/``file_name_on_disk``. The session is logged out
    afterwards, including when waiting times out.

    Args:
        **kwargs: Optional ``logged_in_session``; when absent a new session is
            created via ``login(username, password)``.

    Raises:
        SystemExit: With code 1 when no credentials are configured or when
            the file is not ready within ``dl_wait_timeout`` seconds.
    """
    log.info("Preparing heise.de session ...")
    logged_in_session = kwargs.get("logged_in_session")
    if not logged_in_session:
        if not (username and password):
            log.error("No credentials configured, set HEISE_USERNAME and HEISE_PASSWORD")
            sys.exit(1)
        logged_in_session = login(username, password)

    # os.path.join picks the right separator on every platform.
    target_path = os.path.join(dl_dir, file_name_on_disk)
    have_printed_wait_notice = False
    # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
    started_waiting = time.monotonic()
    log.info("Requesting AWS download ...")
    try:
        while True:
            if time.monotonic() - started_waiting > dl_wait_timeout:
                log.warning(f"We've waited {dl_wait_timeout}s for download. Exiting 1 ...")
                sys.exit(1)
            # stream=True fetches only the headers up front, so a poll that is
            # not ready yet costs no body download and a ready one is written
            # straight to disk without a second request.
            with logged_in_session.get(
                dl_url, stream=True, timeout=_HTTP_TIMEOUT
            ) as dl_request:
                # A missing Content-Length header counts as "not ready".
                announced_size = int(dl_request.headers.get("Content-Length", 0))
                if announced_size > _MIN_DOWNLOAD_BYTES:
                    log.info("Downloading ...")
                    with open(target_path, "wb") as file:
                        for chunk in dl_request.iter_content(chunk_size=_CHUNK_BYTES):
                            file.write(chunk)
                    log.info("Download complete")
                    return
            if not have_printed_wait_notice:
                log.debug("Waiting for heise's AWS backend ...")
                have_printed_wait_notice = True
            time.sleep(_POLL_INTERVAL)
    finally:
        # Runs on success, on timeout (SystemExit) and on errors alike.
        logout(logged_in_session)


# Only start the download when executed as a script, not when imported.
if __name__ == "__main__":
    download_mag()
    log.debug("Done, exiting 0 ...")
    sys.exit(0)