Created
March 12, 2026 01:24
-
-
Save zone559/95eec8e5d2b74a515c85d105d48c3895 to your computer and use it in GitHub Desktop.
HTTP downloader with session header preservation and resume bug fixes. Now preserves existing session headers (cookies, auth, browser fingerprints) instead of overwriting them. Fixed variable scope issue in partial download resume functionality.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| # Copyright 2014-2026 Mike Fährmann | |
| # | |
| # This program is free software; you can redistribute it and/or modify | |
| # it under the terms of the GNU General Public License version 2 as | |
| # published by the Free Software Foundation. | |
| """Downloader module for http:// and https:// URLs""" | |
| import time | |
| import mimetypes | |
| from requests.exceptions import RequestException, ConnectionError, Timeout | |
| from .common import DownloaderBase | |
| from .. import text, util, output, exception | |
| from ssl import SSLError | |
| FLAGS = util.FLAGS | |
| class HttpDownloader(DownloaderBase): | |
| scheme = "http" | |
| def __init__(self, job): | |
| DownloaderBase.__init__(self, job) | |
| extractor = job.extractor | |
| self.downloading = False | |
| self.adjust_extension = self.config("adjust-extensions", True) | |
| self.chunk_size = self.config("chunk-size", 32768) | |
| self.metadata = extractor.config("http-metadata") | |
| self.progress = self.config("progress", 3.0) | |
| self.validate = self.config("validate", True) | |
| self.validate_html = self.config("validate-html", True) | |
| self.headers = self.config("headers") | |
| self.minsize = self.config("filesize-min") | |
| self.maxsize = self.config("filesize-max") | |
| self.retries = self.config("retries", extractor._retries) | |
| self.retry_codes = self.config("retry-codes", extractor._retry_codes) | |
| self.timeout = self.config("timeout", extractor._timeout) | |
| self.verify = self.config("verify", extractor._verify) | |
| self.mtime = self.config("mtime", True) | |
| self.rate = self.config("rate") | |
| interval_429 = self.config("sleep-429") | |
| if self.config("consume-content", False): | |
| self.release_conn = self._release_conn_impl | |
| else: | |
| # this resets the underlying TCP connection, and therefore | |
| # if the program makes another request to the same domain, | |
| # a new connection (either TLS or plain TCP) must be made | |
| self.release_conn = lambda resp: resp.close() | |
| if self.retries < 0: | |
| self.retries = float("inf") | |
| if self.minsize: | |
| minsize = text.parse_bytes(self.minsize) | |
| if not minsize: | |
| self.log.warning( | |
| "Invalid minimum file size (%r)", self.minsize) | |
| self.minsize = minsize | |
| if self.maxsize: | |
| maxsize = text.parse_bytes(self.maxsize) | |
| if not maxsize: | |
| self.log.warning( | |
| "Invalid maximum file size (%r)", self.maxsize) | |
| self.maxsize = maxsize | |
| if isinstance(self.chunk_size, str): | |
| chunk_size = text.parse_bytes(self.chunk_size) | |
| if not chunk_size: | |
| self.log.warning( | |
| "Invalid chunk size (%r)", self.chunk_size) | |
| chunk_size = 32768 | |
| self.chunk_size = chunk_size | |
| if self.rate: | |
| func = util.build_selection_func(self.rate, 0, text.parse_bytes) | |
| if rmax := func.args[1] if hasattr(func, "args") else func(): | |
| if rmax < self.chunk_size: | |
| # reduce chunk_size to allow for one iteration each second | |
| self.chunk_size = rmax | |
| self.rate = func | |
| self.receive = self._receive_rate | |
| else: | |
| self.log.warning("Invalid rate limit (%r)", self.rate) | |
| self.rate = False | |
| if self.progress is not None: | |
| self.receive = self._receive_rate | |
| if self.progress < 0.0: | |
| self.progress = 0.0 | |
| if interval_429 is None: | |
| self.interval_429 = extractor._interval_429 | |
| else: | |
| try: | |
| self.interval_429 = util.build_duration_func_ex(interval_429) | |
| except Exception as exc: | |
| self.log.error("Invalid 'sleep-429' value '%s' (%s: %s)", | |
| interval_429, exc.__class__.__name__, exc) | |
| self.interval_429 = extractor._interval_429 | |
| def download(self, url, pathfmt): | |
| try: | |
| return self._download_impl(url, pathfmt) | |
| except Exception as exc: | |
| if self.downloading: | |
| output.stderr_write("\n") | |
| self.log.traceback(exc) | |
| raise | |
| finally: | |
| # remove file from incomplete downloads | |
| if self.downloading and not self.part: | |
| util.remove_file(pathfmt.temppath) | |
| def _download_impl(self, url, pathfmt): | |
| response = None | |
| tries = code = 0 | |
| msg = "" | |
| file_size = 0 # FIXED: Initialize file_size at the beginning of the function | |
| metadata = self.metadata | |
| kwdict = pathfmt.kwdict | |
| expected_status = kwdict.get( | |
| "_http_expected_status", ()) | |
| adjust_extension = kwdict.get( | |
| "_http_adjust_extension", self.adjust_extension) | |
| if self.part and not metadata: | |
| pathfmt.part_enable(self.partdir) | |
| while True: | |
| if FLAGS.DOWNLOAD is not None: | |
| return FLAGS.process("DOWNLOAD") | |
| if tries: | |
| if response: | |
| self.release_conn(response) | |
| response = None | |
| self.log.warning("%s (%s/%s)", msg, tries, self.retries+1) | |
| if tries > self.retries: | |
| return False | |
| if code == 429 and self.interval_429: | |
| s = self.interval_429(tries) | |
| time.sleep(s if s > tries else tries) | |
| else: | |
| time.sleep(tries) | |
| code = 0 | |
| tries += 1 | |
| file_header = None | |
| # collect HTTP headers | |
| # FIXED: Start with session headers to preserve browser settings (Firefox headers, etc.) | |
| headers = self.session.headers.copy() | |
| # Add Accept header if not already present (for compatibility) | |
| if "Accept" not in headers: | |
| headers["Accept"] = "*/*" | |
| # file-specific headers | |
| if extra := kwdict.get("_http_headers"): | |
| headers.update(extra) | |
| # general headers | |
| if self.headers: | |
| headers.update(self.headers) | |
| # partial content - FIXED: Store file_size for use throughout the function | |
| file_size = pathfmt.part_size() # FIXED: Store in the outer variable | |
| if file_size: | |
| headers["Range"] = f"bytes={file_size}-" | |
| # connect to (remote) source | |
| try: | |
| response = self.session.request( | |
| kwdict.get("_http_method", "GET"), url, | |
| stream=True, | |
| headers=headers, | |
| data=kwdict.get("_http_data"), | |
| timeout=self.timeout, | |
| proxies=self.proxies, | |
| verify=self.verify, | |
| ) | |
| except ConnectionError as exc: | |
| try: | |
| reason = exc.args[0].reason | |
| cls = reason.__class__.__name__ | |
| pre, _, err = str(reason.args[-1]).partition(":") | |
| msg = f"{cls}: {(err or pre).lstrip()}" | |
| except Exception: | |
| msg = str(exc) | |
| continue | |
| except Timeout as exc: | |
| msg = str(exc) | |
| continue | |
| except Exception as exc: | |
| self.log.warning(exc) | |
| return False | |
| # check response | |
| code = response.status_code | |
| if code == 200 or code in expected_status: # OK | |
| offset = 0 | |
| size = response.headers.get("Content-Length") | |
| elif code == 206: # Partial Content | |
| offset = file_size # FIXED: Use the stored file_size | |
| size = response.headers["Content-Range"].rpartition("/")[2] | |
| elif code == 416 and file_size: # Range Not Satisfiable | |
| self._release_conn_impl(response) | |
| break | |
| else: | |
| msg = f"'{code} {response.reason}' for '{url}'" | |
| challenge = util.detect_challenge(response) | |
| if challenge is not None: | |
| self.log.warning(challenge) | |
| if code in self.retry_codes or 500 <= code < 600: | |
| continue | |
| retry = kwdict.get("_http_retry") | |
| if retry and retry(response): | |
| continue | |
| self.release_conn(response) | |
| self.log.warning(msg) | |
| return False | |
| # check for invalid responses | |
| if self.validate and \ | |
| (validate := kwdict.get("_http_validate")) is not None: | |
| try: | |
| result = validate(response) | |
| except Exception: | |
| self.release_conn(response) | |
| raise | |
| if isinstance(result, str): | |
| url = result | |
| tries -= 1 | |
| continue | |
| if not result: | |
| self.release_conn(response) | |
| self.log.warning("Invalid response") | |
| return False | |
| if self.validate_html and response.headers.get( | |
| "content-type", "").startswith("text/html") and \ | |
| pathfmt.extension not in ("html", "htm"): | |
| if response.history: | |
| self.log.warning("HTTP redirect to '%s'", response.url) | |
| else: | |
| self.log.warning("HTML response") | |
| return False | |
| # check file size | |
| size = text.parse_int(size, None) | |
| if size is not None: | |
| if not size: | |
| self.release_conn(response) | |
| self.log.warning("Empty file") | |
| return False | |
| if self.minsize and size < self.minsize: | |
| self.release_conn(response) | |
| self.log.warning( | |
| "File size smaller than allowed minimum (%s < %s)", | |
| size, self.minsize) | |
| pathfmt.temppath = "" | |
| return True | |
| if self.maxsize and size > self.maxsize: | |
| self.release_conn(response) | |
| self.log.warning( | |
| "File size larger than allowed maximum (%s > %s)", | |
| size, self.maxsize) | |
| pathfmt.temppath = "" | |
| return True | |
| build_path = False | |
| # set missing filename extension from MIME type | |
| if not pathfmt.extension: | |
| pathfmt.set_extension(self._find_extension(response)) | |
| build_path = True | |
| # set metadata from HTTP headers | |
| if metadata: | |
| kwdict[metadata] = util.extract_headers(response) | |
| build_path = True | |
| # build and check file path | |
| if build_path: | |
| pathfmt.build_path() | |
| if pathfmt.exists(): | |
| pathfmt.temppath = "" | |
| # release the connection back to pool by explicitly | |
| # calling .close() | |
| # see https://requests.readthedocs.io/en/latest/user | |
| # /advanced/#body-content-workflow | |
| # when the image size is on the order of megabytes, | |
| # re-establishing a TLS connection will typically be faster | |
| # than consuming the whole response | |
| response.close() | |
| return True | |
| if self.part and metadata: | |
| pathfmt.part_enable(self.partdir) | |
| metadata = False | |
| content = response.iter_content(self.chunk_size) | |
| validate_sig = kwdict.get("_http_signature") | |
| validate_ext = (adjust_extension and | |
| pathfmt.extension in SIGNATURE_CHECKS) | |
| # check filename extension against file header | |
| if not offset and (validate_ext or validate_sig): | |
| try: | |
| file_header = next( | |
| content if response.raw.chunked | |
| else response.iter_content(16), b"") | |
| except (RequestException, SSLError) as exc: | |
| msg = str(exc) | |
| continue | |
| if validate_sig: | |
| result = validate_sig(file_header) | |
| if result is not True: | |
| self.release_conn(response) | |
| self.log.warning( | |
| result or "Invalid file signature bytes") | |
| return False | |
| if validate_ext and self._adjust_extension( | |
| pathfmt, file_header) and pathfmt.exists(): | |
| pathfmt.temppath = "" | |
| response.close() | |
| return True | |
| # set open mode - FIXED: Check file_size instead of undefined variable | |
| if not offset: | |
| mode = "w+b" | |
| if file_size: # FIXED: Now using the properly defined file_size | |
| self.log.debug("Unable to resume partial download") | |
| else: | |
| mode = "r+b" | |
| self.log.debug("Resuming download at byte %d", offset) | |
| # download content | |
| self.downloading = True | |
| with pathfmt.open(mode) as fp: | |
| if fp is None: | |
| # '.part' file no longer exists | |
| break | |
| if file_header: | |
| fp.write(file_header) | |
| offset += len(file_header) | |
| elif offset: | |
| if adjust_extension and \ | |
| pathfmt.extension in SIGNATURE_CHECKS: | |
| self._adjust_extension(pathfmt, fp.read(16)) | |
| fp.seek(offset) | |
| self.out.start(pathfmt.path) | |
| try: | |
| self.receive(fp, content, size, offset) | |
| except (RequestException, SSLError) as exc: | |
| msg = str(exc) | |
| output.stderr_write("\n") | |
| continue | |
| except exception.StopExtraction: | |
| response.close() | |
| return False | |
| except exception.ControlException: | |
| response.close() | |
| raise | |
| # check file size | |
| if size and (fsize := fp.tell()) < size: | |
| if (segmented := kwdict.get("_http_segmented")) and \ | |
| segmented is True or segmented == fsize: | |
| tries -= 1 | |
| msg = "Resuming segmented download" | |
| output.stdout_write("\r") | |
| else: | |
| msg = f"file size mismatch ({fsize} < {size})" | |
| output.stderr_write("\n") | |
| continue | |
| break | |
| self.downloading = False | |
| if self.mtime: | |
| if "_http_lastmodified" in kwdict: | |
| kwdict["_mtime_http"] = kwdict["_http_lastmodified"] | |
| else: | |
| kwdict["_mtime_http"] = response.headers.get("Last-Modified") | |
| else: | |
| kwdict["_mtime_http"] = None | |
| return True | |
| def _release_conn_impl(self, response): | |
| """Release connection back to pool by consuming response body""" | |
| try: | |
| for _ in response.iter_content(self.chunk_size): | |
| pass | |
| except (RequestException, SSLError) as exc: | |
| output.stderr_write("\n") | |
| self.log.debug( | |
| "Unable to consume response body (%s: %s); " | |
| "closing the connection anyway", exc.__class__.__name__, exc) | |
| response.close() | |
| def receive(self, fp, content, bytes_total, bytes_start): | |
| write = fp.write | |
| for data in content: | |
| if FLAGS.DOWNLOAD is not None: | |
| return FLAGS.process("DOWNLOAD") | |
| write(data) | |
| def _receive_rate(self, fp, content, bytes_total, bytes_start): | |
| rate = self.rate() if self.rate else None | |
| write = fp.write | |
| progress = self.progress | |
| bytes_downloaded = 0 | |
| time_start = time.monotonic() | |
| for data in content: | |
| if FLAGS.DOWNLOAD is not None: | |
| return FLAGS.process("DOWNLOAD") | |
| time_elapsed = time.monotonic() - time_start | |
| bytes_downloaded += len(data) | |
| write(data) | |
| if progress is not None: | |
| if time_elapsed > progress: | |
| self.out.progress( | |
| bytes_total, | |
| bytes_start + bytes_downloaded, | |
| int(bytes_downloaded / time_elapsed), | |
| ) | |
| if rate is not None: | |
| time_expected = bytes_downloaded / rate | |
| if time_expected > time_elapsed: | |
| time.sleep(time_expected - time_elapsed) | |
| def _find_extension(self, response): | |
| """Get filename extension from MIME type""" | |
| mtype = response.headers.get("Content-Type", "image/jpeg") | |
| mtype = mtype.partition(";")[0].lower() | |
| if "/" not in mtype: | |
| mtype = "image/" + mtype | |
| if mtype in MIME_TYPES: | |
| return MIME_TYPES[mtype] | |
| if ext := mimetypes.guess_extension(mtype, strict=False): | |
| return ext[1:] | |
| self.log.warning("Unknown MIME type '%s'", mtype) | |
| return "bin" | |
| def _adjust_extension(self, pathfmt, file_header): | |
| """Check filename extension against file header""" | |
| if not SIGNATURE_CHECKS[pathfmt.extension](file_header): | |
| for ext, check in SIGNATURE_CHECKS.items(): | |
| if check(file_header): | |
| self.log.debug( | |
| "Adjusting filename extension of '%s' to '%s'", | |
| pathfmt.filename, ext) | |
| pathfmt.set_extension(ext) | |
| pathfmt.build_path() | |
| return True | |
| return False | |
| MIME_TYPES = { | |
| "image/jpeg" : "jpg", | |
| "image/jpg" : "jpg", | |
| "image/png" : "png", | |
| "image/gif" : "gif", | |
| "image/bmp" : "bmp", | |
| "image/x-bmp" : "bmp", | |
| "image/x-ms-bmp": "bmp", | |
| "image/webp" : "webp", | |
| "image/avif" : "avif", | |
| "image/heic" : "heic", | |
| "image/heif" : "heif", | |
| "image/svg+xml" : "svg", | |
| "image/ico" : "ico", | |
| "image/icon" : "ico", | |
| "image/x-icon" : "ico", | |
| "image/vnd.microsoft.icon" : "ico", | |
| "image/x-photoshop" : "psd", | |
| "application/x-photoshop" : "psd", | |
| "image/vnd.adobe.photoshop": "psd", | |
| "video/webm": "webm", | |
| "video/ogg" : "ogg", | |
| "video/mp4" : "mp4", | |
| "video/m4v" : "m4v", | |
| "video/x-m4v": "m4v", | |
| "video/quicktime": "mov", | |
| "audio/wav" : "wav", | |
| "audio/x-wav": "wav", | |
| "audio/webm" : "webm", | |
| "audio/ogg" : "ogg", | |
| "audio/mpeg" : "mp3", | |
| "audio/aac" : "aac", | |
| "audio/x-aac": "aac", | |
| "application/vnd.apple.mpegurl": "m3u8", | |
| "application/x-mpegurl" : "m3u8", | |
| "application/dash+xml" : "mpd", | |
| "application/zip" : "zip", | |
| "application/x-zip": "zip", | |
| "application/x-zip-compressed": "zip", | |
| "application/rar" : "rar", | |
| "application/x-rar": "rar", | |
| "application/x-rar-compressed": "rar", | |
| "application/x-7z-compressed" : "7z", | |
| "application/pdf" : "pdf", | |
| "application/x-pdf": "pdf", | |
| "application/x-shockwave-flash": "swf", | |
| "text/html": "html", | |
| "application/ogg": "ogg", | |
| # https://www.iana.org/assignments/media-types/model/obj | |
| "model/obj": "obj", | |
| "application/octet-stream": "bin", | |
| } | |
| def _signature_html(s): | |
| s = s[:14].lstrip() | |
| return s and b"<!doctype html".startswith(s.lower()) | |
| # https://en.wikipedia.org/wiki/List_of_file_signatures | |
| SIGNATURE_CHECKS = { | |
| "jpg" : lambda s: s[0:3] == b"\xFF\xD8\xFF", | |
| "png" : lambda s: s[0:8] == b"\x89PNG\r\n\x1A\n", | |
| "gif" : lambda s: s[0:6] in (b"GIF87a", b"GIF89a"), | |
| "bmp" : lambda s: s[0:2] == b"BM", | |
| "webp": lambda s: (s[0:4] == b"RIFF" and | |
| s[8:12] == b"WEBP"), | |
| "avif": lambda s: s[4:11] == b"ftypavi" and s[11] in b"fs", | |
| "heic": lambda s: (s[4:10] == b"ftyphe" and s[10:12] in ( | |
| b"ic", b"im", b"is", b"ix", b"vc", b"vm", b"vs")), | |
| "svg" : lambda s: s[0:5] == b"<?xml", | |
| "ico" : lambda s: s[0:4] == b"\x00\x00\x01\x00", | |
| "cur" : lambda s: s[0:4] == b"\x00\x00\x02\x00", | |
| "psd" : lambda s: s[0:4] == b"8BPS", | |
| "mp4" : lambda s: (s[4:8] == b"ftyp" and s[8:11] in ( | |
| b"mp4", b"avc", b"iso")), | |
| "m4v" : lambda s: s[4:11] == b"ftypM4V", | |
| "mov" : lambda s: s[4:12] == b"ftypqt ", | |
| "webm": lambda s: s[0:4] == b"\x1A\x45\xDF\xA3", | |
| "ogg" : lambda s: s[0:4] == b"OggS", | |
| "wav" : lambda s: (s[0:4] == b"RIFF" and | |
| s[8:12] == b"WAVE"), | |
| "mp3" : lambda s: (s[0:3] == b"ID3" or | |
| s[0:2] in (b"\xFF\xFB", b"\xFF\xF3", b"\xFF\xF2")), | |
| "aac" : lambda s: s[0:2] in (b"\xFF\xF9", b"\xFF\xF1"), | |
| "m3u8": lambda s: s[0:7] == b"#EXTM3U", | |
| "mpd" : lambda s: b"<MPD" in s, | |
| "zip" : lambda s: s[0:4] in (b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"), | |
| "rar" : lambda s: s[0:6] == b"Rar!\x1A\x07", | |
| "7z" : lambda s: s[0:6] == b"\x37\x7A\xBC\xAF\x27\x1C", | |
| "pdf" : lambda s: s[0:5] == b"%PDF-", | |
| "swf" : lambda s: s[0:3] in (b"CWS", b"FWS"), | |
| "html": _signature_html, | |
| "htm" : _signature_html, | |
| "blend": lambda s: s[0:7] == b"BLENDER", | |
| # unfortunately the Wavefront .obj format doesn't have a signature, | |
| # so we check for the existence of Blender's comment | |
| "obj" : lambda s: s[0:11] == b"# Blender v", | |
| # Celsys Clip Studio Paint format | |
| # https://github.com/rasensuihei/cliputils/blob/master/README.md | |
| "clip": lambda s: s[0:8] == b"CSFCHUNK", | |
| # check 'bin' files against all other file signatures | |
| "bin" : lambda s: False, | |
| } | |
| __downloader__ = HttpDownloader |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment