# Author: peterfarge@pfarge.de                                                    Date: 07. June 2026
# Description: Download image collections from threads on Vipergirls.to using Scrapy, provided the
#              host is IMX.to. For more detailed instructions, visit www.pfarge.de/vipergirls_imx.html
# Developed with Python 3.13.5 and Scrapy 2.16.0

import re
import argparse
import scrapy
from scrapy import signals
from typing import cast
from pathlib import Path
from scrapy.crawler import CrawlerProcess


class VipergirlsSpider(scrapy.Spider):
    """Download the IMX.to images referenced by the posts of a forum thread.

    The spider walks the thread pages starting at ``thread_url``, selects the
    posts whose number lies in ``[start_post, end_post]``, creates one folder
    per post below ``DEFAULT_DOWNLOAD_FOLDER`` and saves every IMX image of
    the post into that folder.

    Request chain for a single image:
    ``parse`` -> ``parse_imx_page`` (GET of the IMX page) ->
    ``parse_real_image_page`` (POST of the "continue" form) ->
    ``save_image`` (GET of the image file).
    """

    name = "vipergirls"
    DEFAULT_DOWNLOAD_FOLDER = "downloads"

    # Upper bound for the length of a sanitized file/folder name.
    _MAX_NAME_LENGTH = 120

    # Browser identification sent with every request (crawler-wide setting
    # and explicit header of the final image download).
    _BROWSER_USER_AGENT = (
        "Mozilla/5.0 "
        "(Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 "
        "(KHTML, like Gecko) "
        "Chrome/125.0 Safari/537.36"
    )

    custom_settings = {
        "DOWNLOAD_DELAY": 1,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
        "CONCURRENT_REQUESTS": 2,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 1,
        "RETRY_TIMES": 10,
        "COOKIES_ENABLED": True,
        "AUTOTHROTTLE_ENABLED": True,
        "AUTOTHROTTLE_START_DELAY": 1,
        "AUTOTHROTTLE_MAX_DELAY": 10,
        # Switch to "INFO" or "DEBUG" when troubleshooting.
        "LOG_LEVEL": "WARNING",
        "REDIRECT_ENABLED": True,
        # NOTE: "HTTPERROR_ALLOW_ALL": True would hand non-2xx responses to
        # the callbacks instead of treating them as errors; intentionally
        # left disabled.
        "USER_AGENT": _BROWSER_USER_AGENT,
    }

    def __init__(self, thread_url, start_post, end_post, description_lines, *args, **kwargs):
        """Store the crawl parameters.

        Args:
            thread_url: Url of the thread page where crawling starts.
            start_post: Number of the first post to download (inclusive).
            end_post: Number of the last post to download (inclusive).
            description_lines: Number of leading text lines of a post that
                are used to build the name of its download folder.
        """
        super().__init__(*args, **kwargs)
        self.start_urls = [thread_url]
        self.start_post = int(start_post)
        self.end_post = int(end_post)
        self.description_lines = int(description_lines)

    def parse(self, response):
        """Handle one thread page.

        Yields one request per IMX image of every post within the requested
        range and, unless a post beyond ``end_post`` was seen on this page,
        a request for the next thread page.
        """
        print(f"Parse Url: {response.url}")
        abort_nextpage_parsing = False

        for post in response.css("li.postbitlegacy[id^='post_']"):
            global_post_number = self._extract_post_number(post)
            if global_post_number is None:
                continue

            # Only postings within the range of the caller arguments
            if global_post_number < self.start_post:
                continue
            if global_post_number > self.end_post:
                abort_nextpage_parsing = True
                break

            # One of the desired postings found
            print(f"Processing posting: #{global_post_number}")
            stripped = (
                raw.strip()
                for raw in post.css("blockquote.postcontent *::text").getall()
            )
            text_lines = [line for line in stripped if line]

            image_pages = self._collect_image_pages(post, response)

            folder_name = self._build_folder_name(
                text_lines, global_post_number, len(image_pages)
            )
            target_dir = Path(VipergirlsSpider.DEFAULT_DOWNLOAD_FOLDER) / folder_name
            if not target_dir.exists():
                print(f"Creating folder: {folder_name}")
            target_dir.mkdir(parents=True, exist_ok=True)

            # Yield every IMX picture. The priority decreases with the post
            # number and with the position inside the post, so images are
            # requested roughly in thread order.
            image_count = len(image_pages)
            for k, image_page_url in enumerate(image_pages):
                yield scrapy.Request(
                    image_page_url,
                    callback=self.parse_imx_page,
                    errback=self.request_failed,
                    priority=1000000 - global_post_number * 200 - k,
                    meta={
                        "target_dir": str(target_dir),
                        "progress": f"{k + 1:03d}/{image_count}",
                    },
                )

        # Parse next thread page
        next_page = response.css("a[rel='next']::attr(href)").get()
        if not abort_nextpage_parsing and next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    @staticmethod
    def _extract_post_number(post):
        """Return the post number shown as ``#<n>`` in the post header, or None."""
        counter_text = post.css("span.nodecontrols a.postcounter::text").get()
        if not counter_text:
            return None

        counter_text = counter_text.strip()
        if not counter_text.startswith("#"):
            return None

        try:
            return int(counter_text.replace("#", ""))
        except ValueError:
            return None

    @staticmethod
    def _collect_image_pages(post, response):
        """Return the IMX image page urls linked in the content of ``post``.

        Links to an image page (``imx.to/i/<id>``) are taken as they are.
        Direct links (``imx.to/u/i/...``) are converted into the image page
        url of the same id, so the real file name can be looked up on that
        page later on; they are appended after the regular page links.
        """
        image_pages = []
        direct_links = []
        for href in post.css("blockquote.postcontent a::attr(href)").getall():
            href = response.urljoin(href)
            if "imx.to/i/" in href:
                image_pages.append(href)
            elif "imx.to/u/i/" in href:
                direct_links.append(href)

        image_pages.extend(
            "https://imx.to/i/" + Path(direct_link).stem
            for direct_link in direct_links
        )
        return image_pages

    def _build_folder_name(self, text_lines, post_number, image_count):
        """Return the folder name: description + post number + image count.

        The description consists of the first ``description_lines`` text
        lines of the post (fewer if the post is shorter), or ``Posting`` if
        the post has no text. Only the description is shortened when the
        name would exceed the length limit, so the suffix with the post
        number, which keeps folders of different posts apart, always
        survives.
        """
        suffix = f"(#{post_number:04d}, {image_count} images)"
        if text_lines:
            wanted_lines = max(self.description_lines, 0)
            description = " ".join(text_lines[:wanted_lines])
        else:
            description = "Posting"

        # Reserve room for the suffix and the separating blank.
        room = max(self._MAX_NAME_LENGTH - len(suffix) - 1, 0)
        description = self.sanitize_filename(description)[:room].strip()
        if not description:
            return suffix
        return f"{description} {suffix}"

    def parse_imx_page(self, response):
        """Submit the "continue" form of an IMX page to reach the image page."""
        # Press my own continue button
        yield scrapy.Request(
            url=response.url,
            method="POST",
            priority=response.request.priority,
            # Same url as the GET that led here.
            dont_filter=True,
            headers={
                "Content-Type": "application/x-www-form-urlencoded",
                "Referer": response.url,
            },
            body="imgContinue=Continue+to+your+image...",
            callback=self.parse_real_image_page,
            errback=self.request_failed,
            meta={
                "target_dir": response.meta["target_dir"],
                "progress": response.meta["progress"],
            },
        )

    @staticmethod
    def request_failed(failure):
        """Errback: report a failed request on the console."""
        print(f"REQUEST FAILED: {repr(failure)}")

    def parse_real_image_page(self, response):
        """Read file name and image url from the image page and request the file.

        Nothing is requested when the page contains no image or when a file
        of that name already exists in the target folder.
        """
        target_dir = response.meta["target_dir"]
        filename = response.css("img.centred::attr(alt)").get()
        if not filename:
            # Without a file name there is nothing that could be saved.
            print(f"ERROR: No image found on page: {response.url}")
            return

        # The name comes from the remote page: never let it leave target_dir.
        filename = filename.replace("/", "_").replace("\\", "_")
        filepath = Path(target_dir) / filename
        if filepath.exists():
            print(f"File exits: {str(filepath)[len(VipergirlsSpider.DEFAULT_DOWNLOAD_FOLDER)+1:]}")
            return

        image_url = response.css("img.centred::attr(src)").get()
        if image_url is None:
            print(f"ERROR: No image url found on page: {response.url}")
            return

        yield scrapy.Request(
            response.urljoin(image_url),
            callback=self.save_image,
            errback=self.request_failed,
            dont_filter=True,
            priority=response.request.priority,
            headers={
                "Referer": response.url,
                "User-Agent": VipergirlsSpider._BROWSER_USER_AGENT,
                "Accept": (
                    "image/avif,"
                    "image/webp,"
                    "image/apng,"
                    "image/svg+xml,"
                    "image/*,*/*;q=0.8"
                ),
                "Accept-Language": "en-US,en;q=0.9",
                "Connection": "keep-alive",
            },
            meta={"filepath": filepath, "progress": response.meta["progress"]},
        )

    @staticmethod
    def save_image(response):
        """Write the downloaded image to ``response.meta["filepath"]``.

        When the disk is full the user is asked to free some space and the
        write is repeated after <Return>; this blocks the crawl while
        waiting. Any other ``OSError`` is re-raised. A partially written
        file is removed in both cases, so a later run does not mistake it
        for a finished download.
        """
        filepath = response.meta["filepath"]
        progress = response.meta["progress"]

        while True:
            try:
                with open(filepath, "wb") as f:
                    f.write(response.body)
            except OSError as e:
                try:
                    Path(filepath).unlink(missing_ok=True)
                except OSError:
                    # Best effort only; the original error is what matters.
                    pass

                # errno 28 == ENOSPC (no space left on device)
                if e.errno != 28:
                    raise

                print()
                print("=" * 80)
                print("Error: No space left")
                print()
                print(
                    f"File could not be saved:\n"
                    f"{filepath}"
                )
                print()
                print(
                    "Please free some space, then press <Return>."
                )
                print("=" * 80)
                print()
                print("\a")
                input()
                continue
            break

        # Show the path relative to the download folder, shortened if long.
        output_path = str(filepath)[len(VipergirlsSpider.DEFAULT_DOWNLOAD_FOLDER)+1:]
        if len(output_path) > 80:
            output_path = output_path[:20] + " [...] " + output_path[-60:]

        print(f"{progress} files saved: {output_path}")

    @staticmethod
    def sanitize_filename(name):
        """Return ``name`` usable as a file or folder name.

        The characters ``<>:"/\\|?*`` are replaced by ``_``, runs of
        whitespace are collapsed into a single blank, surrounding whitespace
        is removed and the result is cut to 120 characters.
        """
        name = re.sub(r'[<>:"/\\|?*]', "_", name)
        name = re.sub(r"\s+", " ", name)
        return name.strip()[:VipergirlsSpider._MAX_NAME_LENGTH]

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and connect ``spider_closed`` to the signal."""
        spider = cast(VipergirlsSpider, super().from_crawler(crawler, *args, **kwargs))
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        """Print a short summary of the crawl statistics."""
        stats_collector = self.crawler.stats
        if stats_collector is None:
            # No statistics available, nothing to report.
            return

        print("dupefilter/filtered =", stats_collector.get_value("dupefilter/filtered", 0))

        stats = stats_collector.get_stats()
        print("ENQUEUED :", stats.get("scheduler/enqueued"))
        print("DEQUEUED :", stats.get("scheduler/dequeued"))

        print("GET      :", stats.get("downloader/request_method_count/GET"))
        print("POST     :", stats.get("downloader/request_method_count/POST"))

        print("RESP200  :", stats.get("downloader/response_status_count/200"))
        print("RESP302  :", stats.get("downloader/response_status_count/302"))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--thread-url",
        type=str,
        required=True,
        help="Url of the thread/page"
    )

    parser.add_argument(
        "--start-post",
        type=int,
        required=False,
        default=1,
        help="Start posting number"
    )

    parser.add_argument(
        "--end-post",
        type=int,
        required=False,
        default=9999999,
        help="End posting number"
    )

    parser.add_argument(
        "--description_lines",
        type=int,
        required=False,
        default=1,
        help="Number of lines at the starting of each posting, used to create the dowload folder"
    )

    args = parser.parse_args()
    process = CrawlerProcess()
    process.crawl(
        VipergirlsSpider,
        thread_url=args.thread_url,
        start_post=args.start_post,
        end_post=args.end_post,
        description_lines=args.description_lines
    )

    process.start()
    # Beep at the end
    print("\a")