init
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
/output/
|
||||
23
Dockerfile
Normal file
23
Dockerfile
Normal file
@@ -0,0 +1,23 @@
|
||||
FROM mcr.microsoft.com/playwright/python:v1.44.0-jammy
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Зависимости
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Устанавливаем только Chromium (остальные браузеры не нужны)
|
||||
RUN playwright install chromium --with-deps
|
||||
|
||||
COPY src/ ./src/
|
||||
COPY debug_site.py ./debug_site.py
|
||||
COPY debug_cdn.py ./debug_cdn.py
|
||||
|
||||
# Выходные данные и состояние монтируются снаружи
|
||||
VOLUME ["/app/output", "/app/state"]
|
||||
|
||||
ENTRYPOINT ["python", "-m", "src.cli"]
|
||||
CMD ["--help"]
|
||||
|
||||
|
||||
|
||||
86
README.md
Normal file
86
README.md
Normal file
@@ -0,0 +1,86 @@
|
||||
# Manga Downloader
|
||||
|
||||
Загрузчик манги с readmanga.ru. Использует Playwright + Chromium для обхода JS-защиты сайта.
|
||||
|
||||
## Требования
|
||||
|
||||
- Docker + Docker Compose
|
||||
|
||||
## Быстрый старт
|
||||
|
||||
### 1. Собрать образ
|
||||
|
||||
```bash
|
||||
docker compose build
|
||||
```
|
||||
|
||||
### 2. Анализировать мангу (проверить доступность, список глав)
|
||||
|
||||
```bash
|
||||
docker compose run --rm manga analyze https://3.readmanga.ru/magicheskaia_bitva
|
||||
```
|
||||
|
||||
### 3. Скачать всю мангу
|
||||
|
||||
```bash
|
||||
# CBZ (по умолчанию)
|
||||
docker compose run --rm manga download https://3.readmanga.ru/magicheskaia_bitva
|
||||
|
||||
# PDF
|
||||
docker compose run --rm manga download https://3.readmanga.ru/magicheskaia_bitva --format pdf
|
||||
|
||||
# Все форматы сразу
|
||||
docker compose run --rm manga download https://3.readmanga.ru/magicheskaia_bitva --format all
|
||||
|
||||
# EPUB
|
||||
docker compose run --rm manga download https://3.readmanga.ru/magicheskaia_bitva --format epub
|
||||
```
|
||||
|
||||
### 4. Скачать определённые главы
|
||||
|
||||
```bash
|
||||
# Главы с 1 по 10
|
||||
docker compose run --rm manga download <URL> --chapters 1-10
|
||||
|
||||
# Конкретные главы
|
||||
docker compose run --rm manga download <URL> --chapters 1,5,10
|
||||
|
||||
# Одна глава
|
||||
docker compose run --rm manga download <URL> --chapters 47
|
||||
```
|
||||
|
||||
### 5. Продолжить прерванное скачивание
|
||||
|
||||
Скачивание автоматически продолжается с того места, где остановилось (флаг `--resume` включён по умолчанию).
|
||||
|
||||
```bash
|
||||
docker compose run --rm manga download <URL> --resume
|
||||
```
|
||||
|
||||
## Выходные файлы
|
||||
|
||||
Файлы сохраняются в `./output/<название манги>/`:
|
||||
|
||||
```
|
||||
output/
|
||||
Магическая_битва/
|
||||
v01_ch0001.0.cbz
|
||||
v01_ch0002.0.cbz
|
||||
...
|
||||
```
|
||||
|
||||
## Прогресс
|
||||
|
||||
Состояние хранится в `./state/progress.db` (SQLite). Логи — в `./state/manga.log`.
|
||||
|
||||
## Дополнительные опции
|
||||
|
||||
```
|
||||
--format / -f cbz | pdf | epub | all (по умолчанию: cbz)
|
||||
--chapters / -c Диапазон или список глав
|
||||
--output / -o Папка для сохранения (по умолчанию: ./output)
|
||||
--resume Пропускать скачанные главы (по умолчанию: включено)
|
||||
--concurrency Параллельных загрузок (по умолчанию: 4)
|
||||
--verbose / -v Подробный вывод (глобальная опция, указывается перед командой: manga -v download <URL>)
|
||||
```
|
||||
|
||||
63
debug_cdn.py
Normal file
63
debug_cdn.py
Normal file
@@ -0,0 +1,63 @@
|
||||
"""
|
||||
Анализ HTTP 300-редиректов CDN one-way.work
|
||||
"""
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, '/app')
|
||||
from src.browser import BrowserManager
|
||||
from src.scraper import _extract_images_from_js, _navigate_with_retry
|
||||
|
||||
|
||||
async def main():
    """Probe how the image CDN answers for one hard-coded chapter page.

    Opens the chapter in headless Chromium, records the status and headers of
    every response whose URL contains ``one-way.work`` or ``staticfa``, prints
    a short report for the first five of them, and finally requests one image
    URL directly through ``context.request`` so both paths can be compared.
    """
    url = "https://3.readmanga.ru/magicheskaia_bitva/vol1/1?mtr=1"
    referer = "https://3.readmanga.ru/magicheskaia_bitva"
    # Only these response headers are printed in the report.
    reported_headers = ('location', 'content-type', 'content-length', 'x-redirect', 'x-accel')

    async with BrowserManager(headless=True) as bm:
        ctx, page = await bm.new_page()
        try:
            all_responses = {}

            async def on_response(r):
                if "one-way.work" in r.url or "staticfa" in r.url:
                    all_responses[r.url] = {
                        "status": r.status,
                        "headers": dict(r.headers),
                    }

            page.on("response", on_response)

            await _navigate_with_retry(page, url, referer=referer)
            # Give late image requests a moment to arrive.
            await asyncio.sleep(3)

            imgs = await _extract_images_from_js(page)
            print(f"Изображений в readerInit: {len(imgs)}")
            print(f"Перехвачено ответов: {len(all_responses)}")

            # Show status and selected headers for the first few responses.
            for img_url, data in list(all_responses.items())[:5]:
                print(f"\n--- {img_url[:80]} ---")
                print(f" Status: {data['status']}")
                for k, v in data['headers'].items():
                    if k.lower() in reported_headers:
                        print(f" {k}: {v}")

            # Now try a direct request through Playwright's request API.
            print("\n\n=== Прямой запрос через context.request ===")
            if imgs:
                # Prefer the sixth image, but fall back to the last one so that
                # short chapters (fewer than six pages) do not raise IndexError.
                test_url = imgs[min(5, len(imgs) - 1)]
                print(f"URL: {test_url}")
                try:
                    resp = await ctx.request.get(test_url, headers={"Referer": referer})
                    print(f"Status: {resp.status}")
                    print(f"Headers: {dict(resp.headers)}")
                    body = await resp.body()
                    print(f"Body size: {len(body)} bytes")
                except Exception as e:
                    print(f"Error: {e}")
        finally:
            # Close the context even when navigation or extraction fails.
            await ctx.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Guarded so that importing this module does not start a browser session.
    asyncio.run(main())
|
||||
|
||||
67
debug_site.py
Normal file
67
debug_site.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""
|
||||
Диагностический скрипт: снимает скриншот страницы и сохраняет HTML.
|
||||
Аргументы:
|
||||
1: URL страницы (обязательный)
|
||||
2: Referer (опциональный, по умолчанию — домен из URL)
|
||||
"""
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, '/app')
|
||||
from src.browser import BrowserManager
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
async def main(url: str, referer: str | None = None):
    """Open *url* in headless Chromium and dump diagnostics.

    Prints every network response, then saves a viewport screenshot to
    ``/app/output/debug_screenshot.png`` and the page HTML to
    ``/app/output/debug_page.html`` and echoes the first 3000 characters of
    the HTML.

    Args:
        url: Page to open.
        referer: Referer passed to ``page.goto``; when omitted, the
            scheme and host of *url* are used.
    """
    async with BrowserManager(headless=True) as bm:
        ctx, page = await bm.new_page()
        try:
            # Log every response the page receives.
            async def on_response(r):
                print(f" [{r.status}] {r.url[:120]}")

            page.on("response", on_response)

            p = urlparse(url)
            if referer is None:
                referer = f"{p.scheme}://{p.netloc}/"

            print(f"\n=== Открываем: {url} ===")
            print(f"=== Referer: {referer} ===\n")

            try:
                resp = await page.goto(url, wait_until="domcontentloaded",
                                       timeout=30_000, referer=referer)
                print(f"\nСтатус ответа: {resp.status if resp else 'нет'}")
                print(f"Финальный URL: {page.url}")
            except Exception as e:
                # Keep going: a screenshot of the failed state is still useful.
                print(f"Ошибка goto: {e}")

            # Give JavaScript some time to settle; a timeout here is not an error.
            try:
                await page.wait_for_load_state("networkidle", timeout=15_000)
            except Exception:
                pass

            # Screenshot
            out = Path("/app/output/debug_screenshot.png")
            out.parent.mkdir(parents=True, exist_ok=True)
            await page.screenshot(path=str(out), full_page=False)
            print(f"\nСкриншот сохранён: {out}")
            print(f"Title: {await page.title()}")

            # Save the HTML. The encoding is explicit so that non-ASCII pages are
            # written correctly regardless of the container locale.
            html = await page.content()
            html_out = Path("/app/output/debug_page.html")
            html_out.write_text(html, encoding="utf-8")
            # Report the real size in bytes, not the number of characters.
            print(f"HTML сохранён: {html_out} ({len(html.encode('utf-8'))} байт)")
            print("\n=== HTML (первые 3000 символов) ===")
            print(html[:3000])
        finally:
            # Release the context even if a step above raised.
            await ctx.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # argv[1] is the page URL (a sample title is used when it is missing),
    # argv[2] is an optional Referer.
    if len(sys.argv) >= 2:
        url = sys.argv[1]
    else:
        url = "https://3.readmanga.ru/magicheskaia_bitva"
    if len(sys.argv) >= 3:
        ref = sys.argv[2]
    else:
        ref = None
    asyncio.run(main(url, ref))
|
||||
22
docker-compose.yml
Normal file
22
docker-compose.yml
Normal file
@@ -0,0 +1,22 @@
|
||||
version: "3.9"
|
||||
|
||||
services:
|
||||
manga:
|
||||
build: .
|
||||
image: manga-downloader:latest
|
||||
container_name: manga-downloader
|
||||
volumes:
|
||||
- ./output:/app/output
|
||||
- ./state:/app/state
|
||||
environment:
|
||||
- PYTHONUNBUFFERED=1
|
||||
# Chromium требует достаточно /dev/shm
|
||||
shm_size: "2gb"
|
||||
stdin_open: true
|
||||
tty: true
|
||||
# Переопределяется при запуске через:
|
||||
# docker compose run manga download <URL> --format cbz
|
||||
# docker compose run manga analyze <URL>
|
||||
command: ["--help"]
|
||||
|
||||
|
||||
10
requirements.txt
Normal file
10
requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
playwright==1.44.0
|
||||
click==8.1.7
|
||||
Pillow==10.3.0
|
||||
img2pdf==0.5.1
|
||||
ebooklib==0.18
|
||||
tqdm==4.66.4
|
||||
loguru==0.7.2
|
||||
|
||||
|
||||
|
||||
0
src/__init__.py
Normal file
0
src/__init__.py
Normal file
132
src/browser.py
Normal file
132
src/browser.py
Normal file
@@ -0,0 +1,132 @@
|
||||
"""
|
||||
Браузерный слой: запуск Playwright Chromium с антидетект-настройками.
|
||||
"""
|
||||
import asyncio
|
||||
from typing import Optional
|
||||
|
||||
from loguru import logger
|
||||
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
||||
|
||||
|
||||
# Realistic User-Agent of Chrome 124 on Linux.
USER_AGENT = " ".join((
    "Mozilla/5.0 (X11; Linux x86_64)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/124.0.0.0 Safari/537.36",
))
|
||||
|
||||
# JavaScript patch that hides common signs of browser automation.
#
# The snippet is an immediately-invoked function expression: an init script is
# evaluated as plain source text, so a bare arrow function would only be
# created and never executed, leaving the patch without any effect.
# ``permissions.query`` is bound to its owner because calling the detached
# native method raises "Illegal invocation".
STEALTH_JS = """
(() => {
    // Скрываем webdriver
    Object.defineProperty(navigator, 'webdriver', { get: () => false });

    // Подменяем plugins (у headless = 0)
    Object.defineProperty(navigator, 'plugins', {
        get: () => [1, 2, 3, 4, 5],
    });

    // Подменяем languages
    Object.defineProperty(navigator, 'languages', {
        get: () => ['ru-RU', 'ru', 'en-US', 'en'],
    });

    // Убираем chrome.runtime undefined (headless его не имеет)
    window.chrome = { runtime: {} };

    // Фикс permissions API
    const permissions = window.navigator.permissions;
    const originalQuery = permissions.query.bind(permissions);
    permissions.query = (parameters) =>
        parameters.name === 'notifications'
            ? Promise.resolve({ state: Notification.permission })
            : originalQuery(parameters);
})();
"""
|
||||
|
||||
|
||||
class BrowserManager:
    """Owns the lifecycle of a Playwright Chromium browser.

    Can be used as an async context manager: the browser is launched on entry
    and shut down on exit.
    """

    def __init__(self, headless: bool = True, slow_mo: int = 0):
        """Remember launch options; nothing is started until ``start()``."""
        self.headless = headless
        self.slow_mo = slow_mo
        self._playwright = None
        self._browser: Optional[Browser] = None

    async def start(self):
        """Start the Playwright driver and launch Chromium.

        If the browser fails to launch, the already started driver is stopped
        before the exception propagates, so nothing is left running.
        """
        self._playwright = await async_playwright().start()
        try:
            self._browser = await self._playwright.chromium.launch(
                headless=self.headless,
                slow_mo=self.slow_mo,
                args=[
                    "--no-sandbox",
                    "--disable-blink-features=AutomationControlled",
                    "--disable-dev-shm-usage",
                    "--disable-gpu",
                ],
            )
        except Exception:
            await self._playwright.stop()
            self._playwright = None
            raise
        logger.info("Chromium запущен (headless={})", self.headless)

    async def stop(self):
        """Close the browser and stop the Playwright driver.

        Safe to call more than once: the references are cleared first, and the
        driver is stopped even if closing the browser raises.
        """
        browser, playwright = self._browser, self._playwright
        self._browser = None
        self._playwright = None
        try:
            if browser:
                await browser.close()
        finally:
            if playwright:
                await playwright.stop()
        logger.info("Chromium остановлен")

    async def new_context(self) -> BrowserContext:
        """Create a browser context with realistic settings.

        Raises:
            RuntimeError: if the browser has not been started yet.
        """
        if self._browser is None:
            raise RuntimeError("BrowserManager is not started; call start() first")
        context = await self._browser.new_context(
            user_agent=USER_AGENT,
            viewport={"width": 1920, "height": 1080},
            locale="ru-RU",
            timezone_id="Europe/Moscow",
            extra_http_headers={
                "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
                # Without a Referer the server answers 404: it expects the
                # visitor to arrive from another page of the site.
                "Referer": "https://3.readmanga.ru/",
            },
        )
        # Apply the stealth patch to every page created in this context.
        await context.add_init_script(STEALTH_JS)
        return context

    async def new_page(self) -> tuple[BrowserContext, Page]:
        """Create a fresh context with one page and return both."""
        ctx = await self.new_context()
        page = await ctx.new_page()
        return ctx, page

    async def navigate(self, page: Page, url: str, timeout: int = 60_000,
                       referer: str | None = None) -> bool:
        """Open *url* in *page* and wait until the network is idle.

        Args:
            page: Page to navigate.
            url: Address to open.
            timeout: Timeout in milliseconds, applied separately to the
                navigation and to the network-idle wait.
            referer: Referer sent with the navigation request; defaults to
                the scheme and host of *url*.

        Returns:
            True on success; False on an HTTP status >= 400 or on any
            exception (which is logged, not raised).
        """
        # When no referer is given explicitly, derive it from the URL's host.
        if referer is None:
            from urllib.parse import urlparse
            p = urlparse(url)
            referer = f"{p.scheme}://{p.netloc}/"
        try:
            logger.debug("Навигация: {} (referer={})", url, referer)
            response = await page.goto(url, wait_until="domcontentloaded",
                                       timeout=timeout, referer=referer)
            if response and response.status >= 400:
                logger.warning("HTTP {}: {}", response.status, url)
                return False
            # Wait for JavaScript-driven requests to finish.
            await page.wait_for_load_state("networkidle", timeout=timeout)
            return True
        except Exception as e:
            logger.error("Ошибка навигации {}: {}", url, e)
            return False

    async def __aenter__(self):
        """Start the browser and return the manager itself."""
        await self.start()
        return self

    async def __aexit__(self, *_):
        """Stop the browser; exceptions from the body are not suppressed."""
        await self.stop()
|
||||
|
||||
|
||||
|
||||
245
src/cli.py
Normal file
245
src/cli.py
Normal file
@@ -0,0 +1,245 @@
|
||||
"""
|
||||
CLI точка входа.
|
||||
|
||||
Использование:
|
||||
python -m src.cli download <url> [опции]
|
||||
python -m src.cli analyze <url>
|
||||
"""
|
||||
import asyncio
|
||||
import re
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
|
||||
from .browser import BrowserManager
|
||||
from .scraper import get_manga_info, get_chapter_images_and_download, Chapter
|
||||
from .exporter import export, ExportFormat
|
||||
from .state import StateDB
|
||||
|
||||
# Default locations inside the container: downloaded files and persistent state.
OUTPUT_DIR = Path("/app") / "output"
STATE_DIR = Path("/app") / "state"
|
||||
|
||||
|
||||
# ── Настройка логирования ─────────────────────
|
||||
|
||||
def _setup_logging(verbose: bool):
    """Replace loguru's existing sinks with a console sink and a log file.

    The stderr sink shows DEBUG records when *verbose* is true and INFO and
    above otherwise. The file ``STATE_DIR / "manga.log"`` always receives
    DEBUG records and is rotated at 10 MB.
    """
    console_format = "<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | {message}"
    console_level = "INFO"
    if verbose:
        console_level = "DEBUG"

    logger.remove()
    logger.add(sys.stderr, level=console_level, format=console_format)
    logger.add(STATE_DIR / "manga.log", level="DEBUG", rotation="10 MB")
|
||||
|
||||
|
||||
# ── CLI ───────────────────────────────────────
|
||||
|
||||
@click.group()
@click.option("--verbose", "-v", is_flag=True, help="Подробный вывод")
@click.pass_context
def cli(ctx, verbose):
    # Root command group. Deliberately has no docstring: click would display
    # it as the group's help text.
    # The verbosity flag is stored on the context for the subcommands, then
    # logging is configured once for the whole run.
    shared = ctx.ensure_object(dict)
    shared["verbose"] = verbose
    _setup_logging(verbose)
|
||||
|
||||
|
||||
# ── download ──────────────────────────────────
|
||||
|
||||
@cli.command()
@click.argument("url")
@click.option("--format", "-f", "fmt",
              type=click.Choice(["cbz", "pdf", "epub", "all"]),
              default="cbz", show_default=True,
              help="Формат вывода")
@click.option("--chapters", "-c", default=None,
              help="Диапазон глав, напр. 1-10 или 5 или 1,3,7")
@click.option("--output", "-o", default=str(OUTPUT_DIR),
              help="Папка для сохранения", show_default=True)
@click.option("--resume/--no-resume", default=True,
              help="Пропускать уже скачанные главы")
@click.option("--concurrency", default=4, show_default=True,
              help="Параллельных загрузок изображений")
@click.pass_context
def download(ctx, url, fmt, chapters, output, resume, concurrency):
    """Скачать мангу по URL страницы."""
    # The docstring above is the help text click shows to the user.
    # Collect the parsed options and hand them to the async implementation.
    options = {
        "url": url,
        "fmt": fmt,
        "chapters_filter": chapters,
        "output_dir": Path(output),
        "resume": resume,
        "concurrency": concurrency,
        "verbose": ctx.obj.get("verbose", False),
    }
    asyncio.run(_download(**options))
|
||||
|
||||
|
||||
async def _download(url, fmt, chapters_filter, output_dir, resume, concurrency, verbose):
    """Download the chapters of one manga and export them.

    Steps: read the chapter list from the manga page, record every chapter in
    the state database, apply the optional chapter filter, then process the
    selected chapters one by one, skipping those already marked ``done`` when
    *resume* is true.

    The state database and the browser context are released on every exit
    path, including the early return when the manga page cannot be read and
    any exception raised while processing.

    Args:
        url: Manga page URL.
        fmt: ``"cbz"``, ``"pdf"``, ``"epub"`` or ``"all"``.
        chapters_filter: Chapter selection string or None for all chapters.
        output_dir: Root directory; a sub-directory per manga is created.
        resume: Skip chapters whose stored status is ``done``.
        concurrency: Forwarded to the chapter processor.
        verbose: Accepted for interface compatibility; not used here.
    """
    db = StateDB()
    try:
        async with BrowserManager(headless=True) as bm:
            ctx, page = await bm.new_page()
            try:
                # 1. Fetch the chapter list.
                manga = await get_manga_info(page, url)
                if not manga:
                    logger.error("Не удалось получить информацию о манге")
                    return

                manga_dir = output_dir / _safe_name(manga.title)
                manga_dir.mkdir(parents=True, exist_ok=True)

                # 2. Store all chapters in the database.
                for ch in manga.chapters:
                    db.upsert_chapter(url, ch.url, ch.title, ch.number, ch.volume)

                # 3. Filtering.
                chapters = _filter_chapters(manga.chapters, chapters_filter)
                logger.info("Будет скачано глав: {}", len(chapters))

                # 4. Output formats.
                formats: list[ExportFormat] = ["cbz", "pdf", "epub"] if fmt == "all" else [fmt]

                # 5. Download every selected chapter.
                with tqdm(total=len(chapters), desc="Главы", unit="гл") as pbar:
                    for ch in chapters:
                        pbar.set_description(f"Глава {ch.number}: {ch.title[:30]}")

                        # Resume support: skip what is already finished.
                        if resume and db.chapter_status(ch.url) == "done":
                            logger.info("Пропускаем (уже скачана): {}", ch.title)
                        else:
                            await _process_chapter(
                                bm=bm, ctx=ctx, ch=ch,
                                manga_url=url,
                                manga_dir=manga_dir, formats=formats,
                                concurrency=concurrency, db=db,
                            )
                        pbar.update(1)

                logger.info("✅ Готово! Файлы в: {}", manga_dir)
            finally:
                await ctx.close()
    finally:
        db.close()
|
||||
|
||||
|
||||
async def _process_chapter(bm, ctx, ch: Chapter, manga_url: str, manga_dir: Path,
                           formats: list, concurrency: int, db: StateDB):
    """Download one chapter into a temporary directory and export it.

    A dedicated page is opened for the chapter and always closed afterwards.
    Each requested format is exported independently: a failing export is
    logged and does not stop the others. The chapter is marked as failed when
    no images were downloaded or when an unexpected error occurs.
    """
    chapter_page = await ctx.new_page()

    try:
        with tempfile.TemporaryDirectory() as scratch:
            # Open the chapter and download its images in a single pass.
            pages = await get_chapter_images_and_download(
                chapter_page, ch.url, dest_dir=Path(scratch), manga_url=manga_url
            )

            if pages:
                stem = _safe_chapter_name(ch)
                for fmt in formats:
                    target = manga_dir / f"{stem}.{fmt}"
                    try:
                        export(pages, target, fmt, manga_dir.name, ch.title)
                        db.mark_done(ch.url, fmt, str(target))
                    except Exception as e:
                        logger.error("Ошибка экспорта {}: {}", fmt, e)
            else:
                logger.error("Нет скачанных изображений: {}", ch.title)
                db.mark_failed(ch.url)
    except Exception as e:
        logger.error("Ошибка обработки главы {}: {}", ch.title, e)
        db.mark_failed(ch.url)
    finally:
        await chapter_page.close()
|
||||
|
||||
|
||||
# ── analyze ───────────────────────────────────
|
||||
|
||||
@cli.command()
@click.argument("url")
@click.pass_context
def analyze(ctx, url):
    """Анализировать сайт и вывести список глав (без скачивания)."""
    # The docstring above is the help text click shows to the user.
    # ``ctx`` is supplied by @click.pass_context but is not needed here.
    report = _analyze(url)
    asyncio.run(report)
|
||||
|
||||
|
||||
async def _analyze(url: str):
    """Print a summary of the manga at *url* and test-download one chapter.

    Shows the title, URL and chapter count, lists the first 20 chapters, and
    then downloads the images of the last entry of the chapter list into a
    temporary directory to verify that image retrieval works. The browser
    context is closed on every exit path.
    """
    async with BrowserManager(headless=True) as bm:
        ctx, page = await bm.new_page()
        try:
            manga = await get_manga_info(page, url)

            if not manga:
                click.echo("❌ Не удалось получить информацию")
                return

            click.echo(f"\n📚 Манга: {manga.title}")
            click.echo(f"🔗 URL: {manga.url}")
            click.echo(f"📖 Глав: {len(manga.chapters)}\n")

            for ch in manga.chapters[:20]:
                click.echo(f" Том {ch.volume:02d} Гл. {ch.number:06.1f} {ch.title}")

            if len(manga.chapters) > 20:
                click.echo(f" ... и ещё {len(manga.chapters) - 20} глав")

            # Check one chapter.
            # NOTE(review): the last list entry is reported as the "first
            # chapter" — presumably the site lists newest first; confirm.
            if manga.chapters:
                first = manga.chapters[-1]
                click.echo(f"\n🔍 Проверяем первую главу: {first.url}")
                with tempfile.TemporaryDirectory() as tmp:
                    paths = await get_chapter_images_and_download(
                        page, first.url, dest_dir=Path(tmp), manga_url=url
                    )
                    click.echo(f" Скачано изображений: {len(paths)}")
                    for p in paths[:3]:
                        click.echo(f" {p.name} ({p.stat().st_size} байт)")
        finally:
            await ctx.close()
|
||||
|
||||
|
||||
# ── Утилиты ───────────────────────────────────
|
||||
|
||||
def _safe_name(s: str) -> str:
|
||||
return re.sub(r'[^\w\s\-]', '', s).strip().replace(" ", "_")[:80]
|
||||
|
||||
|
||||
def _safe_chapter_name(ch: Chapter) -> str:
    """Build the file stem of a chapter, e.g. ``v01_ch0001.0``.

    The chapter number is zero-padded to six characters with one decimal; the
    ``vNN_`` volume prefix is added only when the volume is non-zero.
    """
    stem = f"ch{ch.number:06.1f}"
    if ch.volume:
        stem = f"v{ch.volume:02d}_{stem}"
    return stem
|
||||
|
||||
|
||||
def _filter_chapters(chapters: list[Chapter], filter_str: str | None) -> list[Chapter]:
    """Select chapters by number.

    *filter_str* is a comma-separated list whose items are either a single
    number (``"5"``, ``"10.5"``) or an inclusive range (``"1-10"``); both
    kinds may be mixed (``"1-3,7"``). Whitespace around items and stray
    commas are ignored, and a reversed range is treated as its ordered
    counterpart. An empty or missing filter returns *chapters* unchanged.

    Raises:
        ValueError: if an item is neither a number nor a range, or if the
            filter contains no items at all.
    """
    if not filter_str:
        return chapters

    exact: set[float] = set()
    ranges: list[tuple[float, float]] = []

    for token in filter_str.split(","):
        token = token.strip()
        if not token:
            continue  # tolerate input such as "1,2,"

        # "1-10" → inclusive range
        m = re.fullmatch(r"(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)", token)
        if m:
            lo, hi = sorted((float(m.group(1)), float(m.group(2))))
            ranges.append((lo, hi))
            continue

        # "7" → single chapter
        try:
            exact.add(float(token))
        except ValueError:
            raise ValueError(f"Некорректный фильтр глав: {token!r}") from None

    if not exact and not ranges:
        raise ValueError(f"Некорректный фильтр глав: {filter_str!r}")

    return [
        c for c in chapters
        if c.number in exact or any(lo <= c.number <= hi for lo, hi in ranges)
    ]
|
||||
|
||||
|
||||
# Entry point for ``python -m src.cli``.
if __name__ == "__main__":
    cli()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
106
src/downloader.py
Normal file
106
src/downloader.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""
|
||||
Загрузчик изображений через Playwright response interception.
|
||||
Браузер сам загружает все картинки (умеет в CDN 300-редиректы).
|
||||
Мы перехватываем байты прямо из сетевых ответов.
|
||||
"""
|
||||
import asyncio
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from loguru import logger
|
||||
from playwright.async_api import Page, Response
|
||||
|
||||
|
||||
async def download_images(
    context,  # BrowserContext (unused, kept for backward compatibility)
    image_urls: list[str],
    dest_dir: Path,
    concurrency: int = 4,
    chapter_url: str = "https://3.readmanga.ru/",
    page: Optional[Page] = None,
) -> list[Path]:
    """Save the images of a chapter by intercepting the page's responses.

    A response listener stores the body of every response whose URL (without
    the query string) matches one of *image_urls*. The page is then scrolled
    to make the browser request the images, and the function waits up to
    30 seconds for all of them to arrive. The listener is always removed
    before returning, even when scrolling fails.

    Args:
        context: Not used.
        image_urls: Expected image URLs in page order.
        dest_dir: Directory for the files, created if missing. Files are
            named by zero-padded page index.
        concurrency: Not used.
        chapter_url: Not used.
        page: Page that already shows the chapter; required.

    Returns:
        Paths of the captured images in page order; pages that were never
        loaded are skipped with a warning.

    Raises:
        ValueError: if *page* is None.
    """
    if page is None:
        raise ValueError("page обязателен")

    dest_dir.mkdir(parents=True, exist_ok=True)

    def _base(url: str) -> str:
        # Compare URLs without their query string.
        return url.split("?")[0]

    url_to_idx = {_base(u): i for i, u in enumerate(image_urls)}
    saved: dict[int, Path] = {}

    async def _capture_response(response: Response):
        base = _base(response.url)
        if base not in url_to_idx:
            return
        idx = url_to_idx[base]
        if idx in saved:
            return
        ct = response.headers.get("content-type", "")
        looks_like_image = any(t in ct for t in ("image/", "application/octet"))
        # Fall back to the file extension when the content type is unhelpful.
        if not looks_like_image and not re.search(r"\.(jpg|jpeg|png|webp)", base, re.I):
            return
        if response.status not in (200, 206):
            return
        try:
            body = await response.body()
            if not body:
                return
            ext = _get_ext(response.url)
            path = dest_dir / f"{idx:04d}{ext}"
            path.write_bytes(body)
            saved[idx] = path
            logger.debug("Перехвачена стр. {} ({} байт)", idx + 1, len(body))
        except Exception as e:
            logger.debug("Не удалось захватить стр. {}: {}", idx + 1, e)

    page.on("response", _capture_response)
    try:
        await _scroll_to_trigger_load(page)

        # Poll once per second, up to 30 seconds; check first so that nothing
        # is waited for when everything has already been captured.
        for _ in range(30):
            if len(saved) >= len(image_urls):
                break
            await asyncio.sleep(1)
    finally:
        page.remove_listener("response", _capture_response)

    logger.info("Перехвачено изображений: {}/{}", len(saved), len(image_urls))

    paths = []
    for idx in range(len(image_urls)):
        if idx in saved:
            paths.append(saved[idx])
        else:
            logger.warning("Страница {} не была загружена браузером", idx + 1)

    return paths
|
||||
|
||||
|
||||
async def _scroll_to_trigger_load(page: Page):
    """Scroll the page top to bottom in 600-pixel steps, then back to the top.

    The document height is re-read after every step because it may change as
    content loads. Any error is logged at debug level and otherwise ignored.
    """
    async def _document_height():
        return await page.evaluate("document.body.scrollHeight")

    stride = 600
    try:
        offset = 0
        total = await _document_height()
        while offset < total:
            await page.evaluate(f"window.scrollTo(0, {offset})")
            await asyncio.sleep(0.15)
            offset += stride
            total = await _document_height()
        await page.evaluate("window.scrollTo(0, 0)")
    except Exception as e:
        logger.debug("Ошибка скролла: {}", e)
|
||||
|
||||
|
||||
def _get_ext(url: str) -> str:
|
||||
m = re.search(r"\.(jpg|jpeg|png|webp)(\?|$)", url, re.IGNORECASE)
|
||||
if m:
|
||||
ext = m.group(1).lower()
|
||||
return ".jpg" if ext == "jpeg" else f".{ext}"
|
||||
return ".jpg"
|
||||
|
||||
127
src/exporter.py
Normal file
127
src/exporter.py
Normal file
@@ -0,0 +1,127 @@
|
||||
"""
|
||||
Экспорт в CBZ, PDF, EPUB.
|
||||
"""
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
from loguru import logger
|
||||
|
||||
# Output formats understood by export().
ExportFormat = Literal[
    "cbz",
    "pdf",
    "epub",
]
|
||||
|
||||
|
||||
def export(
|
||||
image_paths: list[Path],
|
||||
output_path: Path,
|
||||
fmt: ExportFormat,
|
||||
title: str = "Manga",
|
||||
chapter: str = "",
|
||||
):
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
logger.info("Экспортирую {} страниц → {} ({})", len(image_paths), output_path.name, fmt)
|
||||
|
||||
if fmt == "cbz":
|
||||
_export_cbz(image_paths, output_path)
|
||||
elif fmt == "pdf":
|
||||
_export_pdf(image_paths, output_path)
|
||||
elif fmt == "epub":
|
||||
_export_epub(image_paths, output_path, title, chapter)
|
||||
else:
|
||||
raise ValueError(f"Неизвестный формат: {fmt}")
|
||||
|
||||
logger.info("Сохранено: {}", output_path)
|
||||
|
||||
|
||||
# ── CBZ ───────────────────────────────────────
|
||||
|
||||
def _export_cbz(images: list[Path], out: Path):
|
||||
with zipfile.ZipFile(out, "w", compression=zipfile.ZIP_DEFLATED) as zf:
|
||||
for i, img in enumerate(images):
|
||||
zf.write(img, f"{i:04d}{img.suffix}")
|
||||
|
||||
|
||||
# ── PDF ───────────────────────────────────────
|
||||
|
||||
def _export_pdf(images: list[Path], out: Path):
    """Write *images* as a PDF, preferring img2pdf over Pillow.

    The conversion runs before the output file is opened, so a failed
    conversion does not leave a truncated file behind. If img2pdf is missing
    or fails, a warning is logged and the Pillow-based writer is used.
    """
    try:
        import img2pdf
        pdf_bytes = img2pdf.convert([str(p) for p in images])
    except Exception as e:
        logger.warning("img2pdf не сработал ({}), использую Pillow", e)
        _export_pdf_pillow(images, out)
        return

    with open(out, "wb") as f:
        f.write(pdf_bytes)
|
||||
|
||||
|
||||
def _export_pdf_pillow(images: list[Path], out: Path):
    """Write *images* as a multi-page PDF using Pillow.

    Every image is converted to RGB. The source files are closed as soon as
    they have been converted, and the converted copies are released once the
    PDF has been written. Nothing is written when *images* is empty.
    """
    from PIL import Image

    pil_images = []
    try:
        for p in images:
            # convert() returns an independent image, so the file handle can
            # be closed right away.
            with Image.open(p) as src:
                pil_images.append(src.convert("RGB"))
        if pil_images:
            pil_images[0].save(
                out,
                save_all=True,
                append_images=pil_images[1:],
                format="PDF",
            )
    finally:
        for img in pil_images:
            img.close()
|
||||
|
||||
|
||||
# ── EPUB ──────────────────────────────────────
|
||||
|
||||
def _export_epub(images: list[Path], out: Path, title: str, chapter: str):
    """Write *images* as an EPUB with one XHTML page per image.

    Args:
        images: Page images in reading order.
        out: Destination file.
        title: Book title; also part of the book identifier.
        chapter: Chapter title; appended to the book title when non-empty.
    """
    from ebooklib import epub

    book = epub.EpubBook()
    book.set_identifier(f"manga-{title}-{chapter}".replace(" ", "-"))
    book.set_title(f"{title} — {chapter}" if chapter else title)
    book.set_language("ru")

    spine = ["nav"]
    toc = []

    for i, img_path in enumerate(images):
        # Add the image itself to the book.
        img_name = f"images/page_{i:04d}{img_path.suffix}"
        epub_img = epub.EpubImage()
        epub_img.file_name = img_name
        epub_img.media_type = _mime(img_path.suffix)
        epub_img.content = img_path.read_bytes()
        book.add_item(epub_img)

        # One XHTML page per image.
        page_html = epub.EpubHtml(
            title=f"Страница {i + 1}",
            file_name=f"page_{i:04d}.xhtml",
            lang="ru",
        )
        page_html.content = (
            f'<html><body style="margin:0;padding:0;">'
            f'<img src="{img_name}" style="max-width:100%;height:auto;display:block;margin:auto;"/>'
            f'</body></html>'
        )
        book.add_item(page_html)
        spine.append(page_html)
        toc.append(epub.Link(f"page_{i:04d}.xhtml", f"Страница {i + 1}", f"page{i}"))

    book.toc = toc
    book.spine = spine
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())

    epub.write_epub(str(out), book)
|
||||
|
||||
|
||||
def _mime(ext: str) -> str:
|
||||
return {
|
||||
".jpg": "image/jpeg",
|
||||
".jpeg": "image/jpeg",
|
||||
".png": "image/png",
|
||||
".webp": "image/webp",
|
||||
}.get(ext.lower(), "image/jpeg")
|
||||
|
||||
364
src/scraper.py
Normal file
364
src/scraper.py
Normal file
@@ -0,0 +1,364 @@
|
||||
"""
|
||||
Парсер readmanga.ru: список глав и URL/байты изображений внутри главы.
|
||||
"""
|
||||
import asyncio
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from loguru import logger
|
||||
from playwright.async_api import Page
|
||||
|
||||
from .browser import BrowserManager
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Модели данных
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
@dataclass
class Chapter:
    """One chapter of a manga as listed on the manga page."""

    title: str  # link text of the chapter
    url: str  # absolute URL of the chapter page
    number: float = field(default=0.0)  # chapter number; fractional values allowed
    volume: int = field(default=0)  # volume number; 0 when unknown


@dataclass
class MangaInfo:
    """A manga title together with the chapters found on its page."""

    title: str  # cleaned-up page title
    url: str  # URL the information was read from
    chapters: list[Chapter] = field(default_factory=list)  # fresh list per instance
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Страница манги — список глав
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
async def get_manga_info(page: Page, url: str) -> Optional[MangaInfo]:
    """Open the manga page and return its title and chapter list.

    The title is the page title cut at the first hyphen, dash or pipe. The
    chapter list comes from the main table parser, with the generic link
    parser as a fallback when the table yields nothing.

    Returns:
        The collected information (possibly with an empty chapter list), or
        None when the page could not be opened.
    """
    logger.info("Загружаем страницу манги: {}", url)
    if not await _navigate(page, url):
        return None

    raw_title = await page.title()
    title = re.sub(r"\s*[-–|].*$", "", raw_title).strip()
    logger.info("Манга: {}", title)

    await _expand_chapters(page)
    found = await _extract_chapters(page) or await _extract_chapters_alt(page)

    logger.info("Найдено глав: {}", len(found))
    return MangaInfo(title=title, url=url, chapters=found)
|
||||
|
||||
|
||||
async def _navigate(page: Page, url: str, retries: int = 3,
                    referer: str | None = None) -> bool:
    """Open *url* with retries and a growing pause between attempts.

    An attempt fails when ``page.goto`` raises or the response status is
    400 or higher. After a failed attempt the function sleeps
    ``3 * attempt`` seconds, except after the last one, where waiting would
    serve no purpose. After a successful load it waits up to 10 seconds for
    the network to become idle; a timeout there is not treated as a failure.

    Args:
        page: Page to navigate.
        url: Address to open.
        retries: Maximum number of attempts.
        referer: Referer for the request; defaults to the scheme and host
            of *url*.

    Returns:
        True as soon as one attempt succeeds, False when all of them fail.
    """
    from urllib.parse import urlparse

    if referer is None:
        p = urlparse(url)
        referer = f"{p.scheme}://{p.netloc}/"

    for attempt in range(1, retries + 1):
        try:
            resp = await page.goto(url, wait_until="domcontentloaded",
                                   timeout=60_000, referer=referer)
        except Exception as e:
            logger.warning("Попытка {}/{}: {}", attempt, retries, e)
        else:
            if not resp or resp.status < 400:
                try:
                    await page.wait_for_load_state("networkidle", timeout=10_000)
                except Exception:
                    # Best effort: the page is usable even if it never goes idle.
                    pass
                return True
            logger.warning("Попытка {}/{}: HTTP {}", attempt, retries, resp.status)

        if attempt < retries:
            await asyncio.sleep(3 * attempt)

    return False
|
||||
|
||||
|
||||
async def _expand_chapters(page: Page):
    """Click the first visible "all chapters" control, if the page has one.

    The candidate selectors are tried in order. A selector that is not
    visible, or whose handling raises, is skipped; the function returns after
    the first successful click followed by a network-idle wait.
    """
    candidates = (
        "a.chapter-link.all",
        "button:has-text('Все главы')",
        "a:has-text('Все главы')",
    )
    for selector in candidates:
        try:
            control = page.locator(selector).first
            shown = await control.is_visible(timeout=2000)
            if not shown:
                continue
            await control.click()
            await page.wait_for_load_state("networkidle", timeout=10_000)
        except Exception:
            continue
        else:
            return
|
||||
|
||||
|
||||
async def _extract_chapters(page: Page) -> list[Chapter]:
    """Primary parser: read chapters from the ``#chapters-list`` table.

    Every ``tr.item-row`` contributes one chapter: the link comes from
    ``td[class*='item-title'] a`` and the volume / number from the
    ``data-vol`` / ``data-num`` attributes of the row's ``td[data-num]``
    cell. ``data-num`` is divided by 10 to obtain the chapter number.
    Rows without a link or without an ``href`` are skipped.

    Returns:
        Chapters in table order; an empty list when no row matches.
    """
    from urllib.parse import urljoin

    def _to_int(raw) -> int:
        # A missing or malformed attribute must not abort the whole parse.
        try:
            return int(raw or "0")
        except ValueError:
            return 0

    chapters = []
    for row in await page.query_selector_all("#chapters-list tr.item-row"):
        link = await row.query_selector("td[class*='item-title'] a")
        if not link:
            continue
        href = await link.get_attribute("href") or ""
        if not href:
            continue
        text = (await link.inner_text()).strip()

        td = await row.query_selector("td[data-num]")
        vol = _to_int(await td.get_attribute("data-vol")) if td else 0
        num_raw = _to_int(await td.get_attribute("data-num")) if td else 0

        chapters.append(Chapter(
            title=text,
            # urljoin keeps absolute links as they are and resolves every
            # kind of relative link against the current page address.
            url=urljoin(page.url, href),
            number=num_raw / 10.0,
            volume=vol,
        ))
    return chapters
|
||||
|
||||
|
||||
async def _extract_chapters_alt(page: Page) -> list[Chapter]:
    """Fallback parser: build chapters from every link containing ``/vol``.

    The links are collected in the page; entries with an empty address or
    empty text are dropped there. Chapter number and volume are parsed from
    the link text with ``_parse_num`` and ``_parse_vol``.
    """
    found = await page.evaluate("""
        () => {
            const links = Array.from(document.querySelectorAll('a[href*="/vol"]'));
            return links.map(a => ({ href: a.href, text: a.textContent.trim() }))
                        .filter(x => x.href && x.text);
        }
    """)

    chapters = []
    for entry in found:
        label = entry["text"]
        chapters.append(Chapter(
            title=label,
            url=entry["href"],
            number=_parse_num(label),
            volume=_parse_vol(label),
        ))
    return chapters
|
||||
|
||||
|
||||
def _base_url(url: str) -> str:
|
||||
m = re.match(r"(https?://[^/]+)", url)
|
||||
return m.group(1) if m else "https://readmanga.ru"
|
||||
|
||||
|
||||
def _parse_num(text: str) -> float:
|
||||
m = re.search(r"[\d]+(?:[.,]\d+)?", text.replace(",", "."))
|
||||
return float(m.group()) if m else 0.0
|
||||
|
||||
|
||||
def _parse_vol(text: str) -> int:
|
||||
m = re.search(r"Том\s+(\d+)", text, re.IGNORECASE)
|
||||
return int(m.group(1)) if m else 0
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Страница главы — получение URL изображений
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
async def _extract_images_from_js(page: Page) -> list[str]:
    """Read image URLs from the ``readerInit(...)`` call in the page scripts.

    The in-page code finds the first ``[`` after the text ``readerInit`` in
    each script, counts brackets to locate the matching ``]`` and evaluates
    that slice as an array. Every item with at least three elements yields
    ``item[0] + item[2]``. The first script giving a non-empty array wins.

    Returns:
        The list of URLs, or an empty list when nothing was found or the
        evaluation failed.
    """
    # NOTE: ``eval`` runs inside the browser page, on text taken from that
    # same page's own <script> elements.
    script = """
        () => {
            for (const s of document.querySelectorAll('script')) {
                const text = s.textContent || '';
                const mi = text.indexOf('readerInit');
                if (mi === -1) continue;
                const ai = text.indexOf('[', mi);
                if (ai === -1) continue;
                let depth = 0, end = -1;
                for (let i = ai; i < text.length; i++) {
                    if (text[i] === '[') depth++;
                    else if (text[i] === ']') { depth--; if (!depth) { end = i+1; break; } }
                }
                if (end === -1) continue;
                try {
                    const arr = eval(text.slice(ai, end));
                    if (Array.isArray(arr) && arr.length)
                        return arr.map(item => Array.isArray(item) && item.length >= 3
                            ? item[0] + item[2] : null).filter(Boolean);
                } catch(e) {}
            }
            return [];
        }
    """
    try:
        urls = await page.evaluate(script)
        if not urls:
            return []
        logger.debug("JS readerInit нашёл {} изображений", len(urls))
        return urls
    except Exception as err:
        logger.debug("JS-метод не сработал: {}", err)
        return []
|
||||
|
||||
|
||||
async def _extract_images_from_dom(page: Page) -> list[str]:
    """Collect image URLs from ``<img>`` elements of the reader.

    A fixed list of selectors is tried in order inside the page. The first
    selector that matches any element decides the result: the ``src`` (or
    ``data-src``) of each matched image, with empty values removed.

    Returns:
        The list of URLs, or an empty list when nothing matched or the
        evaluation raised.
    """
    script = """
        () => {
            for (const sel of ['img.manga-page', '.page-image img', '#mangaReader img', 'img[data-src]']) {
                const found = Array.from(document.querySelectorAll(sel));
                if (found.length) return found.map(i => i.src || i.dataset.src).filter(Boolean);
            }
            return [];
        }
    """
    try:
        urls = await page.evaluate(script)
    except Exception:
        return []
    return urls or []
|
||||
|
||||
|
||||
def _get_ext(url: str) -> str:
|
||||
m = re.search(r"\.(jpg|jpeg|png|webp)(\?|$)", url, re.IGNORECASE)
|
||||
if m:
|
||||
ext = m.group(1).lower()
|
||||
return ".jpg" if ext == "jpeg" else f".{ext}"
|
||||
return ".jpg"
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Скачивание главы
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
async def get_chapter_images_and_download(
    page: Page,
    chapter_url: str,
    dest_dir: Path,
    manga_url: str | None = None,
) -> list[Path]:
    """Download every page image of one chapter into ``dest_dir``.

    Steps:
        1. Open the chapter page (the original author's note says this is
           what sets the DDoS-Guard cookies needed by the image CDN).
        2. Read the ordered image URL list from ``readerInit``, falling back
           to ``<img>`` elements in the DOM.
        3. Intercept image requests with ``page.route()`` + ``route.fetch()``
           so the bytes are fetched by the browser itself.
        4. Press ArrowRight repeatedly so the reader requests every page.
        5. Write the captured images as ``NNNN.<ext>`` in reading order.

    Args:
        page: Playwright page to drive.
        chapter_url: Address of the chapter reader page.
        dest_dir: Target directory; created when missing.
        manga_url: Referer for the navigation. Defaults to the manga's
            landing page derived from the first path segment of
            ``chapter_url``.

    Returns:
        Paths of the saved images sorted by page index. Empty when the
        chapter could not be opened or no image list was found.
    """
    from urllib.parse import urlparse

    logger.info("Загружаем главу: {}", chapter_url)

    parsed = urlparse(chapter_url)
    parts = parsed.path.strip("/").split("/")
    manga_slug = parts[0] if parts else ""
    referer = manga_url or f"{parsed.scheme}://{parsed.netloc}/{manga_slug}"

    load_url = chapter_url + ("?mtr=1" if "?" not in chapter_url else "&mtr=1")
    dest_dir.mkdir(parents=True, exist_ok=True)

    # Compiled once here; the route handler runs for every request.
    img_re = re.compile(r"\.(jpg|jpeg|png|webp)(\?|$)", re.I)
    cdn_re = re.compile(r"one-way\.work|staticfa\.|rm\.one-way|cdnmanga|reimg", re.I)

    def _base(u: str) -> str:
        # URL without its query string; used as the identity of an image.
        return u.split("?")[0]

    def _is_manga_image(url: str) -> bool:
        # Only image files ...
        if not img_re.search(_base(url)):
            return False
        # ... that are not site statics (logos, icons, fonts) ...
        if "resrmr." in url or "/static/" in url:
            return False
        # ... and that come from one of the known image hosts.
        return bool(cdn_re.search(url))

    captured: dict[str, bytes] = {}  # query-less URL -> image bytes
    lock = asyncio.Lock()

    async def route_handler(route, request):
        url = request.url
        base = _base(url)
        if not _is_manga_image(url):
            await route.continue_()
            return
        async with lock:
            already = base in captured
        if already:
            # Already stored: let the browser handle the request normally.
            await route.continue_()
            return
        try:
            response = await route.fetch()
            body = await response.body()
            if body and len(body) > 500 and response.status in (200, 206):
                async with lock:
                    if base not in captured:
                        captured[base] = body
                        logger.debug("✓ {}: {} байт", base.split("/")[-1], len(body))
            await route.fulfill(response=response)
        except Exception as e:
            logger.debug("route.fetch {}: {}", base[-40:], e)
            try:
                await route.continue_()
            except Exception:
                pass

    url_to_idx: dict[str, int] = {}

    async def _pages_captured() -> int:
        # Count only wanted chapter pages: other images that pass the host
        # filter must not make the paging loop stop early.
        async with lock:
            return sum(1 for key in captured if key in url_to_idx)

    await page.route("**/*", route_handler)
    try:
        # 1. Open the chapter.
        if not await _navigate(page, load_url, referer=referer):
            logger.error("Не удалось открыть главу: {}", chapter_url)
            return []

        # 2. Wait until a script mentioning readerInit is present.
        try:
            await page.wait_for_function(
                "() => Array.from(document.querySelectorAll('script'))"
                ".some(s => s.textContent.includes('readerInit'))",
                timeout=15_000,
            )
        except Exception:
            logger.debug("readerInit не появился за 15с")

        # 3. Extract the ordered URL list.
        image_urls = await _extract_images_from_js(page)
        if not image_urls:
            image_urls = await _extract_images_from_dom(page)
        if not image_urls:
            logger.error("Список изображений пуст: {}", chapter_url)
            return []

        logger.info("Найдено изображений: {}", len(image_urls))
        url_to_idx.update({_base(u): i for i, u in enumerate(image_urls)})
        total = len(url_to_idx)

        # 4. Page through the reader; it loads images as the user advances.
        await asyncio.sleep(1)
        for i in range(total + 10):
            if await _pages_captured() >= total:
                break
            try:
                await page.keyboard.press("ArrowRight")
                await asyncio.sleep(0.5)
            except Exception:
                break
            if i % 20 == 19:
                logger.debug("Пролистано {}, загружено: {}/{}",
                             i + 1, await _pages_captured(), total)

        # Final grace period for requests still in flight.
        await asyncio.sleep(3)
    finally:
        # Always detach the handler, also on early return or an unexpected
        # error, so it does not keep intercepting later navigations.
        await page.unroute("**/*", route_handler)

    logger.info("Перехвачено: {}/{}", await _pages_captured(), total)

    # 5. Save in reading order.
    paths: dict[int, Path] = {}
    for base_url, body in captured.items():
        idx = url_to_idx.get(base_url)
        if idx is None:
            continue
        target = dest_dir / f"{idx:04d}{_get_ext(base_url)}"
        target.write_bytes(body)
        paths[idx] = target

    missing = total - len(paths)
    if missing:
        logger.warning("Не загружено страниц: {}", missing)

    return [paths[i] for i in sorted(paths)]
|
||||
|
||||
|
||||
|
||||
92
src/state.py
Normal file
92
src/state.py
Normal file
@@ -0,0 +1,92 @@
|
||||
"""
|
||||
Хранение состояния скачивания в SQLite.
|
||||
"""
|
||||
import sqlite3
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
DB_PATH = Path("/app/state/progress.db")
|
||||
|
||||
|
||||
class StateDB:
    """SQLite-backed record of per-chapter download state.

    One row of the ``chapters`` table describes one chapter, keyed by its
    unique ``chapter_url``. Every write is committed immediately. The object
    can be used as a context manager, which closes the connection on exit.
    """

    # Formats that have a matching ``output_<fmt>`` column in ``chapters``.
    _OUTPUT_FORMATS = ("cbz", "pdf", "epub")

    def __init__(self, db_path: Path = DB_PATH):
        """Open (creating if needed) the database file at ``db_path``."""
        db_path.parent.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite3.connect(str(db_path))
        self._init()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def _init(self):
        """Create the ``chapters`` table when it does not exist yet."""
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS chapters (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                manga_url TEXT NOT NULL,
                chapter_url TEXT NOT NULL UNIQUE,
                title TEXT,
                number REAL,
                volume INTEGER,
                status TEXT DEFAULT 'pending',
                output_cbz TEXT,
                output_pdf TEXT,
                output_epub TEXT,
                updated_at TEXT
            )
        """)
        self.conn.commit()

    @staticmethod
    def _rows_as_dicts(cur) -> list[dict]:
        """Turn all remaining rows of ``cur`` into column-name -> value dicts."""
        cols = [d[0] for d in cur.description]
        return [dict(zip(cols, row)) for row in cur.fetchall()]

    def upsert_chapter(self, manga_url: str, chapter_url: str,
                       title: str = "", number: float = 0, volume: int = 0):
        """Insert a chapter, or refresh its metadata when it already exists.

        For an existing ``chapter_url`` only ``title``, ``number`` and
        ``volume`` are overwritten; the other columns keep their values.
        """
        self.conn.execute("""
            INSERT INTO chapters (manga_url, chapter_url, title, number, volume, updated_at)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(chapter_url) DO UPDATE SET
                title = excluded.title,
                number = excluded.number,
                volume = excluded.volume
        """, (manga_url, chapter_url, title, number, volume, _now()))
        self.conn.commit()

    def mark_done(self, chapter_url: str, fmt: str, output_path: str):
        """Set the chapter's status to ``done`` and store the output path.

        Args:
            chapter_url: Key of the chapter row.
            fmt: Output format; one of ``cbz``, ``pdf`` or ``epub``.
            output_path: Value stored in the ``output_<fmt>`` column.

        Raises:
            ValueError: If ``fmt`` has no matching output column.
        """
        # The column name is spliced into the SQL text, so it must come from
        # the fixed whitelist and never from an arbitrary caller string.
        if fmt not in self._OUTPUT_FORMATS:
            raise ValueError(f"unsupported output format: {fmt!r}")
        col = f"output_{fmt}"
        self.conn.execute(f"""
            UPDATE chapters SET status='done', {col}=?, updated_at=?
            WHERE chapter_url=?
        """, (output_path, _now(), chapter_url))
        self.conn.commit()

    def mark_failed(self, chapter_url: str):
        """Set the chapter's status to ``failed``."""
        self.conn.execute("""
            UPDATE chapters SET status='failed', updated_at=? WHERE chapter_url=?
        """, (_now(), chapter_url))
        self.conn.commit()

    def get_pending(self, manga_url: str) -> list[dict]:
        """Return chapters of ``manga_url`` whose status is not ``done``.

        Each dict has ``chapter_url``, ``title``, ``number`` and ``volume``;
        the list is ordered by volume, then number.
        """
        cur = self.conn.execute("""
            SELECT chapter_url, title, number, volume
            FROM chapters
            WHERE manga_url=? AND status != 'done'
            ORDER BY volume, number
        """, (manga_url,))
        return self._rows_as_dicts(cur)

    def get_all(self, manga_url: str) -> list[dict]:
        """Return every stored column for all chapters of ``manga_url``.

        The list is ordered by volume, then number.
        """
        cur = self.conn.execute("""
            SELECT * FROM chapters WHERE manga_url=? ORDER BY volume, number
        """, (manga_url,))
        return self._rows_as_dicts(cur)

    def chapter_status(self, chapter_url: str) -> Optional[str]:
        """Return the stored status of a chapter, or ``None`` if unknown."""
        cur = self.conn.execute(
            "SELECT status FROM chapters WHERE chapter_url=?", (chapter_url,))
        row = cur.fetchone()
        return row[0] if row else None

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
return datetime.utcnow().isoformat()
|
||||
|
||||
Reference in New Issue
Block a user