This commit is contained in:
2026-04-29 01:53:16 +03:00
commit ba6bfc5ed3
14 changed files with 1338 additions and 0 deletions

0
src/__init__.py Normal file
View File

132
src/browser.py Normal file
View File

@@ -0,0 +1,132 @@
"""
Браузерный слой: запуск Playwright Chromium с антидетект-настройками.
"""
import asyncio
from typing import Optional
from loguru import logger
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
# Realistic User-Agent string: Chrome 124 on Linux.
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)
# JavaScript patch that hides typical automation fingerprints.
# It is registered through add_init_script(), which receives the script as
# plain source text. The code therefore has to invoke itself (IIFE): a bare
# `() => {...}` expression would be evaluated and discarded without running.
STEALTH_JS = """
(() => {
    // Скрываем webdriver
    Object.defineProperty(navigator, 'webdriver', { get: () => false });
    // Подменяем plugins (у headless = 0)
    Object.defineProperty(navigator, 'plugins', {
        get: () => [1, 2, 3, 4, 5],
    });
    // Подменяем languages
    Object.defineProperty(navigator, 'languages', {
        get: () => ['ru-RU', 'ru', 'en-US', 'en'],
    });
    // Убираем chrome.runtime undefined (headless его не имеет)
    window.chrome = { runtime: {} };
    // Фикс permissions API
    const permissions = window.navigator.permissions;
    if (permissions && permissions.query) {
        // Keep the native method bound to its object; calling it detached
        // throws "Illegal invocation".
        const originalQuery = permissions.query.bind(permissions);
        permissions.query = (parameters) =>
            parameters.name === 'notifications'
                ? Promise.resolve({ state: Notification.permission })
                : originalQuery(parameters);
    }
})();
"""
class BrowserManager:
    """Manage the lifecycle of a Playwright Chromium browser.

    Can be used as an async context manager: entering calls ``start()`` and
    leaving calls ``stop()``.
    """

    def __init__(self, headless: bool = True, slow_mo: int = 0):
        """Store launch options; nothing is started until ``start()``.

        Args:
            headless: Forwarded to ``chromium.launch(headless=...)``.
            slow_mo: Forwarded to ``chromium.launch(slow_mo=...)``.
        """
        self.headless = headless
        self.slow_mo = slow_mo
        self._playwright = None
        self._browser: Optional[Browser] = None

    async def start(self):
        """Start Playwright and launch Chromium.

        If the launch fails, the Playwright instance started a moment earlier
        is stopped again before the error propagates, so a failed start does
        not leave it running.
        """
        self._playwright = await async_playwright().start()
        try:
            self._browser = await self._playwright.chromium.launch(
                headless=self.headless,
                slow_mo=self.slow_mo,
                args=[
                    "--no-sandbox",
                    "--disable-blink-features=AutomationControlled",
                    "--disable-dev-shm-usage",
                    "--disable-gpu",
                ],
            )
        except Exception:
            await self._playwright.stop()
            self._playwright = None
            raise
        logger.info("Chromium запущен (headless={})", self.headless)

    async def stop(self):
        """Close the browser and stop Playwright.

        Playwright is stopped even when closing the browser raises. The
        references are cleared first, so calling ``stop()`` twice is harmless.
        """
        browser, self._browser = self._browser, None
        playwright, self._playwright = self._playwright, None
        try:
            if browser:
                await browser.close()
        finally:
            if playwright:
                await playwright.stop()
        logger.info("Chromium остановлен")

    async def new_context(self) -> BrowserContext:
        """Create a browser context with realistic settings and the stealth script.

        Raises:
            RuntimeError: If the browser has not been started.
        """
        if self._browser is None:
            raise RuntimeError("BrowserManager is not started; call start() first")
        context = await self._browser.new_context(
            user_agent=USER_AGENT,
            viewport={"width": 1920, "height": 1080},
            locale="ru-RU",
            timezone_id="Europe/Moscow",
            extra_http_headers={
                "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
                # Without a Referer the server answers 404: it expects the
                # visitor to arrive from another page of the site.
                "Referer": "https://3.readmanga.ru/",
            },
        )
        try:
            # Register the stealth patch for every page opened in this context.
            await context.add_init_script(STEALTH_JS)
        except Exception:
            # Do not leave a half-configured context behind.
            await context.close()
            raise
        return context

    async def new_page(self) -> tuple[BrowserContext, Page]:
        """Create a fresh context plus one page in it; returns ``(context, page)``."""
        ctx = await self.new_context()
        try:
            page = await ctx.new_page()
        except Exception:
            await ctx.close()
            raise
        return ctx, page

    async def navigate(self, page: Page, url: str, timeout: int = 60_000,
                       referer: str | None = None) -> bool:
        """Open ``url`` in ``page`` and wait until the network is idle.

        Args:
            page: Page to navigate.
            url: Target URL.
            timeout: Timeout in milliseconds, used for both the navigation and
                the ``networkidle`` wait.
            referer: Referer sent with the navigation request. Defaults to the
                root of the target URL's own site.

        Returns:
            True on success; False on an HTTP status >= 400 or on any
            exception, including a ``networkidle`` timeout.
        """
        # No explicit referer: fall back to the target site's root.
        if referer is None:
            from urllib.parse import urlparse
            p = urlparse(url)
            referer = f"{p.scheme}://{p.netloc}/"
        try:
            logger.debug("Навигация: {} (referer={})", url, referer)
            response = await page.goto(url, wait_until="domcontentloaded",
                                       timeout=timeout, referer=referer)
            if response and response.status >= 400:
                logger.warning("HTTP {}: {}", response.status, url)
                return False
            # Wait for page scripts to finish their network activity.
            await page.wait_for_load_state("networkidle", timeout=timeout)
            return True
        except Exception as e:
            logger.error("Ошибка навигации {}: {}", url, e)
            return False

    async def __aenter__(self):
        await self.start()
        return self

    async def __aexit__(self, *_):
        await self.stop()

245
src/cli.py Normal file
View File

@@ -0,0 +1,245 @@
"""
CLI точка входа.
Использование:
python -m src.cli download <url> [опции]
python -m src.cli analyze <url>
"""
import asyncio
import re
import sys
import tempfile
from pathlib import Path
import click
from loguru import logger
from tqdm import tqdm
from .browser import BrowserManager
from .scraper import get_manga_info, get_chapter_images_and_download, Chapter
from .exporter import export, ExportFormat
from .state import StateDB
# Default location for exported files.
OUTPUT_DIR = Path("/app/output")
# Location of the log file written by _setup_logging().
STATE_DIR = Path("/app/state")


# ── Logging setup ─────────────────────────────
def _setup_logging(verbose: bool):
    """Replace loguru's handlers with a console sink and a rotating file sink.

    The console shows DEBUG only when ``verbose`` is set; the log file always
    records DEBUG.
    """
    console_format = "<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | {message}"
    if verbose:
        console_level = "DEBUG"
    else:
        console_level = "INFO"
    logger.remove()
    logger.add(sys.stderr, level=console_level, format=console_format)
    logger.add(STATE_DIR / "manga.log", level="DEBUG", rotation="10 MB")
# ── CLI ───────────────────────────────────────
@click.group()
@click.option("--verbose", "-v", is_flag=True, help="Подробный вывод")
@click.pass_context
def cli(ctx, verbose):
    # Root command group. Deliberately documented with a comment: click would
    # print a docstring as the group's help text.
    shared = ctx.ensure_object(dict)
    shared["verbose"] = verbose
    _setup_logging(verbose)
# ── download ──────────────────────────────────
@cli.command()
@click.argument("url")
@click.option("--format", "-f", "fmt",
              type=click.Choice(["cbz", "pdf", "epub", "all"]),
              default="cbz", show_default=True,
              help="Формат вывода")
@click.option("--chapters", "-c", default=None,
              help="Диапазон глав, напр. 1-10 или 5 или 1,3,7")
@click.option("--output", "-o", default=str(OUTPUT_DIR),
              help="Папка для сохранения", show_default=True)
@click.option("--resume/--no-resume", default=True,
              help="Пропускать уже скачанные главы")
@click.option("--concurrency", default=4, show_default=True,
              help="Параллельных загрузок изображений")
@click.pass_context
def download(ctx, url, fmt, chapters, output, resume, concurrency):
    """Скачать мангу по URL страницы."""
    # The docstring above is the help text click prints; it stays verbatim.
    # This wrapper only translates CLI options into the async worker's
    # arguments and runs it to completion.
    verbose = ctx.obj.get("verbose", False)
    job = _download(
        url=url,
        fmt=fmt,
        chapters_filter=chapters,
        output_dir=Path(output),
        resume=resume,
        concurrency=concurrency,
        verbose=verbose,
    )
    asyncio.run(job)
async def _download(url, fmt, chapters_filter, output_dir, resume, concurrency, verbose):
    """Download the selected chapters of one manga and export them.

    Args:
        url: URL of the manga page.
        fmt: "cbz", "pdf", "epub" or "all".
        chapters_filter: Optional chapter filter string (see _filter_chapters).
        output_dir: Base directory; a per-manga sub-directory is created in it.
        resume: Skip chapters whose stored status is "done".
        concurrency: Forwarded to _process_chapter.
        verbose: Accepted for interface compatibility; not used here.

    The state database and the browser context are released on every exit
    path, including early returns and exceptions.
    """
    db = StateDB()
    try:
        async with BrowserManager(headless=True) as bm:
            ctx, page = await bm.new_page()
            try:
                # 1. Fetch the chapter list.
                manga = await get_manga_info(page, url)
                if not manga:
                    logger.error("Не удалось получить информацию о манге")
                    return
                manga_dir = output_dir / _safe_name(manga.title)
                manga_dir.mkdir(parents=True, exist_ok=True)
                # 2. Record every chapter in the state DB.
                for ch in manga.chapters:
                    db.upsert_chapter(url, ch.url, ch.title, ch.number, ch.volume)
                # 3. Apply the user's chapter filter.
                try:
                    chapters = _filter_chapters(manga.chapters, chapters_filter)
                except ValueError as e:
                    # A malformed filter is a user error, not a crash.
                    logger.error("Некорректный фильтр глав {!r}: {}", chapters_filter, e)
                    return
                logger.info("Будет скачано глав: {}", len(chapters))
                # 4. Resolve the export formats.
                formats: list[ExportFormat] = ["cbz", "pdf", "epub"] if fmt == "all" else [fmt]
                # 5. Download chapter by chapter.
                failed = 0
                with tqdm(total=len(chapters), desc="Главы", unit="гл") as pbar:
                    for ch in chapters:
                        pbar.set_description(f"Глава {ch.number}: {ch.title[:30]}")
                        # Resume support: skip chapters already marked done.
                        if resume and db.chapter_status(ch.url) == "done":
                            logger.info("Пропускаем (уже скачана): {}", ch.title)
                            pbar.update(1)
                            continue
                        await _process_chapter(
                            bm=bm, ctx=ctx, ch=ch,
                            manga_url=url,
                            manga_dir=manga_dir, formats=formats,
                            concurrency=concurrency, db=db,
                        )
                        # _process_chapter reports failure through the DB status.
                        if db.chapter_status(ch.url) == "failed":
                            failed += 1
                        pbar.update(1)
                if failed:
                    logger.warning("Не удалось скачать глав: {}. Файлы в: {}", failed, manga_dir)
                else:
                    logger.info("✅ Готово! Файлы в: {}", manga_dir)
            finally:
                await ctx.close()
    finally:
        db.close()
async def _process_chapter(bm, ctx, ch: Chapter, manga_url: str, manga_dir: Path,
                           formats: list, concurrency: int, db: StateDB):
    """Download one chapter into a temporary directory and export it.

    Args:
        bm: Browser manager (unused here; kept for interface compatibility).
        ctx: Browser context in which a dedicated page is opened.
        ch: Chapter to download.
        manga_url: URL of the manga page, forwarded to the image downloader.
        manga_dir: Directory receiving the exported files.
        formats: Export formats to produce.
        concurrency: Unused here; kept for interface compatibility.
        db: State database updated with the outcome.

    The chapter ends up "failed" when no images were downloaded, when any
    requested format failed to export, or on any other error. That way a later
    run with resume enabled retries it instead of skipping it.
    """
    # A separate page per chapter keeps page state from leaking between chapters.
    ch_page = await ctx.new_page()
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            # Open the chapter and download its images in a single pass.
            image_paths = await get_chapter_images_and_download(
                ch_page, ch.url, dest_dir=tmp_path, manga_url=manga_url
            )
            if not image_paths:
                logger.error("Нет скачанных изображений: {}", ch.title)
                db.mark_failed(ch.url)
                return
            ch_name = _safe_chapter_name(ch)
            failed_formats = []
            for fmt in formats:
                out_file = manga_dir / f"{ch_name}.{fmt}"
                try:
                    export(image_paths, out_file, fmt, manga_dir.name, ch.title)
                    db.mark_done(ch.url, fmt, str(out_file))
                except Exception as e:
                    logger.error("Ошибка экспорта {}: {}", fmt, e)
                    failed_formats.append(fmt)
            if failed_formats:
                # mark_done() above already set status='done' for the formats
                # that worked; downgrade it so the chapter is retried. The
                # recorded output paths are left untouched by mark_failed().
                db.mark_failed(ch.url)
    except Exception as e:
        logger.error("Ошибка обработки главы {}: {}", ch.title, e)
        db.mark_failed(ch.url)
    finally:
        await ch_page.close()
# ── analyze ───────────────────────────────────
@cli.command()
@click.argument("url")
@click.pass_context
def analyze(ctx, url):
    """Анализировать сайт и вывести список глав (без скачивания)."""
    # The docstring above is the help text click prints; it stays verbatim.
    report = _analyze(url)
    asyncio.run(report)
async def _analyze(url: str):
    """Print the manga title, the first 20 listed chapters and a probe download.

    The probe downloads the images of the last entry of the chapter list into
    a temporary directory and reports how many files arrived.
    """
    async with BrowserManager(headless=True) as bm:
        _, page = await bm.new_page()
        manga = await get_manga_info(page, url)
        if not manga:
            click.echo("Не удалось получить информацию")
            return

        click.echo(f"\n📚 Манга: {manga.title}")
        click.echo(f"🔗 URL: {manga.url}")
        click.echo(f"📖 Глав: {len(manga.chapters)}\n")

        shown = manga.chapters[:20]
        for ch in shown:
            click.echo(f" Том {ch.volume:02d} Гл. {ch.number:06.1f} {ch.title}")
        if len(manga.chapters) > 20:
            click.echo(f" ... и ещё {len(manga.chapters) - 20} глав")

        # Nothing to probe when the list is empty.
        if not manga.chapters:
            return

        # Probe a single chapter: the last entry of the list.
        first = manga.chapters[-1]
        click.echo(f"\n🔍 Проверяем первую главу: {first.url}")
        with tempfile.TemporaryDirectory() as tmp:
            paths = await get_chapter_images_and_download(
                page, first.url, dest_dir=Path(tmp), manga_url=url
            )
            click.echo(f" Скачано изображений: {len(paths)}")
            for p in paths[:3]:
                click.echo(f" {p.name} ({p.stat().st_size} байт)")
# ── Утилиты ───────────────────────────────────
def _safe_name(s: str) -> str:
return re.sub(r'[^\w\s\-]', '', s).strip().replace(" ", "_")[:80]
def _safe_chapter_name(ch: Chapter) -> str:
    """Build the file stem for a chapter, e.g. ``v01_ch0005.0``.

    The volume prefix is left out when the volume is 0 or unset.
    """
    stem = f"ch{ch.number:06.1f}"
    if not ch.volume:
        return stem
    return f"v{ch.volume:02d}_" + stem
def _filter_chapters(chapters: list[Chapter], filter_str: str | None) -> list[Chapter]:
if not filter_str:
return chapters
# "1-10" → диапазон
m = re.match(r"^(\d+(?:\.\d+)?)-(\d+(?:\.\d+)?)$", filter_str)
if m:
lo, hi = float(m.group(1)), float(m.group(2))
return [c for c in chapters if lo <= c.number <= hi]
# "1,3,7" → список
nums = {float(x.strip()) for x in filter_str.split(",")}
return [c for c in chapters if c.number in nums]
# Run the click command group when the module is executed directly
# (e.g. `python -m src.cli ...`).
if __name__ == "__main__":
    cli()

106
src/downloader.py Normal file
View File

@@ -0,0 +1,106 @@
"""
Загрузчик изображений через Playwright response interception.
Браузер сам загружает все картинки (умеет в CDN 300-редиректы).
Мы перехватываем байты прямо из сетевых ответов.
"""
import asyncio
import re
from pathlib import Path
from typing import Optional
from loguru import logger
from playwright.async_api import Page, Response
async def download_images(
    context,  # BrowserContext (unused, kept for compatibility)
    image_urls: list[str],
    dest_dir: Path,
    concurrency: int = 4,
    chapter_url: str = "https://3.readmanga.ru/",
    page: Optional[Page] = None,
) -> list[Path]:
    """Save the images of ``image_urls`` as the browser loads them.

    A response listener is attached to ``page``, the page is scrolled to
    trigger loading, and matching responses are written to ``dest_dir`` as
    ``NNNN.<ext>``, where NNNN is the position of the URL in ``image_urls``.
    Waiting stops once every distinct URL is captured, or after about 30 s.

    Args:
        context: Unused; kept for interface compatibility.
        image_urls: Expected image URLs in page order; query strings are
            ignored when matching responses.
        dest_dir: Target directory (created when missing).
        concurrency: Unused; kept for interface compatibility.
        chapter_url: Unused; kept for interface compatibility.
        page: Page that displays the chapter. Required.

    Returns:
        Paths of the captured images in page order; missing pages are skipped.

    Raises:
        ValueError: If ``page`` is None.
    """
    if page is None:
        raise ValueError("page обязателен")
    dest_dir.mkdir(parents=True, exist_ok=True)

    def _base(url: str) -> str:
        return url.split("?")[0]

    url_to_idx = {_base(u): i for i, u in enumerate(image_urls)}
    saved: dict[int, Path] = {}

    async def _capture_response(response: Response):
        base = _base(response.url)
        idx = url_to_idx.get(base)
        if idx is None or idx in saved:
            return
        ct = response.headers.get("content-type", "")
        looks_like_image = (
            any(t in ct for t in ("image/", "application/octet"))
            or re.search(r"\.(jpg|jpeg|png|webp)", base, re.I)
        )
        if not looks_like_image:
            return
        if response.status not in (200, 206):
            return
        try:
            body = await response.body()
            if not body:
                return
            ext = _get_ext(response.url)
            path = dest_dir / f"{idx:04d}{ext}"
            path.write_bytes(body)
            saved[idx] = path
            logger.debug("Перехвачена стр. {} ({} байт)", idx + 1, len(body))
        except Exception as e:
            logger.debug("Не удалось захватить стр. {}: {}", idx + 1, e)

    page.on("response", _capture_response)
    try:
        await _scroll_to_trigger_load(page)
        # Compare against the number of distinct URLs: repeated URLs share one
        # index, so len(image_urls) could never be reached with duplicates.
        for _ in range(30):
            if len(saved) >= len(url_to_idx):
                break
            await asyncio.sleep(1)
    finally:
        # Always detach the listener, even when scrolling or waiting fails.
        page.remove_listener("response", _capture_response)

    logger.info("Перехвачено изображений: {}/{}", len(saved), len(image_urls))
    paths = []
    for idx in range(len(image_urls)):
        if idx in saved:
            paths.append(saved[idx])
        else:
            logger.warning("Страница {} не была загружена браузером", idx + 1)
    return paths
async def _scroll_to_trigger_load(page: Page):
    """Scroll the page down in 600 px steps, then jump back to the top.

    The document height is re-read before every step. Any error is logged at
    debug level and otherwise ignored.
    """
    offset = 0
    try:
        while offset < await page.evaluate("document.body.scrollHeight"):
            await page.evaluate(f"window.scrollTo(0, {offset})")
            await asyncio.sleep(0.15)
            offset += 600
        await page.evaluate("window.scrollTo(0, 0)")
    except Exception as e:
        logger.debug("Ошибка скролла: {}", e)
def _get_ext(url: str) -> str:
m = re.search(r"\.(jpg|jpeg|png|webp)(\?|$)", url, re.IGNORECASE)
if m:
ext = m.group(1).lower()
return ".jpg" if ext == "jpeg" else f".{ext}"
return ".jpg"

127
src/exporter.py Normal file
View File

@@ -0,0 +1,127 @@
"""
Экспорт в CBZ, PDF, EPUB.
"""
import zipfile
from pathlib import Path
from typing import Literal
from loguru import logger
# Formats understood by export().
ExportFormat = Literal["cbz", "pdf", "epub"]


def export(
    image_paths: list[Path],
    output_path: Path,
    fmt: ExportFormat,
    title: str = "Manga",
    chapter: str = "",
):
    """Write ``image_paths`` to ``output_path`` in the requested format.

    The parent directory is created first. ``title`` and ``chapter`` are only
    used for EPUB.

    Raises:
        ValueError: If ``fmt`` is not one of "cbz", "pdf", "epub".
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    logger.info("Экспортирую {} страниц → {} ({})", len(image_paths), output_path.name, fmt)
    if fmt not in ("cbz", "pdf", "epub"):
        raise ValueError(f"Неизвестный формат: {fmt}")
    if fmt == "epub":
        _export_epub(image_paths, output_path, title, chapter)
    else:
        writer = _export_cbz if fmt == "cbz" else _export_pdf
        writer(image_paths, output_path)
    logger.info("Сохранено: {}", output_path)
# ── CBZ ───────────────────────────────────────
def _export_cbz(images: list[Path], out: Path):
with zipfile.ZipFile(out, "w", compression=zipfile.ZIP_DEFLATED) as zf:
for i, img in enumerate(images):
zf.write(img, f"{i:04d}{img.suffix}")
# ── PDF ───────────────────────────────────────
def _export_pdf(images: list[Path], out: Path):
    """Write the images as a PDF using img2pdf, falling back to Pillow.

    Any failure of the img2pdf path, including the import itself, triggers the
    Pillow fallback.
    """
    try:
        import img2pdf
        with open(out, "wb") as handle:
            pdf_bytes = img2pdf.convert(list(map(str, images)))
            handle.write(pdf_bytes)
    except Exception as e:
        logger.warning("img2pdf не сработал ({}), использую Pillow", e)
        _export_pdf_pillow(images, out)
def _export_pdf_pillow(images: list[Path], out: Path):
    """Write the images as a multi-page PDF using Pillow.

    Each image is converted to RGB. The source file is closed as soon as its
    converted copy exists, and the copies are released once the PDF is
    written. Nothing is written when ``images`` is empty.
    """
    from PIL import Image

    pages = []
    try:
        for p in images:
            # Image.open() keeps the file open; the context manager closes it
            # once convert() has produced an independent in-memory copy.
            with Image.open(p) as src:
                pages.append(src.convert("RGB"))
        if pages:
            pages[0].save(
                out,
                save_all=True,
                append_images=pages[1:],
                format="PDF",
            )
    finally:
        for page in pages:
            page.close()
# ── EPUB ──────────────────────────────────────
def _export_epub(images: list[Path], out: Path, title: str, chapter: str):
    """Write the images as an EPUB with one XHTML page per image.

    Args:
        images: Page images in reading order.
        out: Target file.
        title: Manga title; used for the identifier and the book title.
        chapter: Chapter title; appended to the book title when non-empty.
    """
    from ebooklib import epub

    book = epub.EpubBook()
    book.set_identifier(f"manga-{title}-{chapter}".replace(" ", "-"))
    # Separate the two titles; concatenating them directly produced names
    # like "MangaChapter 1".
    book.set_title(f"{title} — {chapter}" if chapter else title)
    book.set_language("ru")
    spine = ["nav"]
    toc = []
    for i, img_path in enumerate(images):
        # Embed the image itself.
        img_name = f"images/page_{i:04d}{img_path.suffix}"
        epub_img = epub.EpubImage()
        epub_img.file_name = img_name
        epub_img.media_type = _mime(img_path.suffix)
        epub_img.content = img_path.read_bytes()
        book.add_item(epub_img)
        # One XHTML page that shows just this image.
        page_html = epub.EpubHtml(
            title=f"Страница {i + 1}",
            file_name=f"page_{i:04d}.xhtml",
            lang="ru",
        )
        page_html.content = (
            f'<html><body style="margin:0;padding:0;">'
            f'<img src="{img_name}" style="max-width:100%;height:auto;display:block;margin:auto;"/>'
            f'</body></html>'
        )
        book.add_item(page_html)
        spine.append(page_html)
        toc.append(epub.Link(f"page_{i:04d}.xhtml", f"Страница {i + 1}", f"page{i}"))
    book.toc = toc
    book.spine = spine
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    epub.write_epub(str(out), book)
def _mime(ext: str) -> str:
return {
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".png": "image/png",
".webp": "image/webp",
}.get(ext.lower(), "image/jpeg")

364
src/scraper.py Normal file
View File

@@ -0,0 +1,364 @@
"""
Парсер readmanga.ru: список глав и URL/байты изображений внутри главы.
"""
import asyncio
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
from loguru import logger
from playwright.async_api import Page
from .browser import BrowserManager
# ──────────────────────────────────────────────
# Модели данных
# ──────────────────────────────────────────────
@dataclass
class Chapter:
    """One chapter entry found on a manga page."""

    # Text of the chapter link.
    title: str
    # URL of the chapter page.
    url: str
    # Chapter number; _extract_chapters computes it as data-num / 10.
    number: float = 0.0
    # Volume number; stays 0 when the parsers find none.
    volume: int = 0
@dataclass
class MangaInfo:
    """Title, URL and chapter list of one manga, as returned by get_manga_info()."""

    # Page title with the trailing site suffix stripped.
    title: str
    # URL the manga page was loaded from.
    url: str
    # Chapters in the order the page parser produced them.
    chapters: list[Chapter] = field(default_factory=list)
# ──────────────────────────────────────────────
# Страница манги — список глав
# ──────────────────────────────────────────────
async def get_manga_info(page: Page, url: str) -> Optional[MangaInfo]:
    """Open the manga page and return its title and chapter list.

    Args:
        page: Page used for navigation.
        url: URL of the manga page.

    Returns:
        A MangaInfo (possibly with an empty chapter list), or None when the
        page could not be opened.
    """
    logger.info("Загружаем страницу манги: {}", url)
    ok = await _navigate(page, url)
    if not ok:
        return None
    title = await page.title()
    # Strip the site suffix ("Title - site", "Title | site"). A hyphen only
    # counts as a separator when surrounded by whitespace, so hyphenated
    # titles such as "One-Punch Man" are kept intact.
    title = re.sub(r"\s*\|.*$|\s+[-–—]\s.*$", "", title).strip()
    logger.info("Манга: {}", title)
    await _expand_chapters(page)
    chapters = await _extract_chapters(page)
    if not chapters:
        # The primary table layout was not found; fall back to link scanning.
        chapters = await _extract_chapters_alt(page)
    logger.info("Найдено глав: {}", len(chapters))
    return MangaInfo(title=title, url=url, chapters=chapters)
async def _navigate(page: Page, url: str, retries: int = 3,
                    referer: str | None = None) -> bool:
    """Open ``url`` with retries and a growing pause between attempts.

    Args:
        page: Page to navigate.
        url: Target URL.
        retries: Maximum number of attempts.
        referer: Referer for the request; defaults to the root of the target
            site.

    Returns:
        True once an attempt produced an HTTP status below 400 (a
        ``networkidle`` timeout afterwards is tolerated); False when every
        attempt failed.
    """
    from urllib.parse import urlparse

    if referer is None:
        p = urlparse(url)
        referer = f"{p.scheme}://{p.netloc}/"
    for attempt in range(1, retries + 1):
        try:
            resp = await page.goto(url, wait_until="domcontentloaded",
                                   timeout=60_000, referer=referer)
            if resp and resp.status >= 400:
                logger.warning("Попытка {}/{}: HTTP {}", attempt, retries, resp.status)
            else:
                try:
                    await page.wait_for_load_state("networkidle", timeout=10_000)
                except Exception as e:
                    # Best effort: the DOM is loaded, so carry on regardless.
                    logger.debug("Состояние networkidle не достигнуто: {}", e)
                return True
        except Exception as e:
            logger.warning("Попытка {}/{}: {}", attempt, retries, e)
        # Back off before the next attempt; pausing after the last one would
        # only delay the failure report.
        if attempt < retries:
            await asyncio.sleep(3 * attempt)
    return False
async def _expand_chapters(page: Page):
    """Click the first visible "all chapters" control, if the page has one.

    Candidate selectors are tried in order. A selector that is not visible or
    raises is skipped; the function returns after the first one that was
    clicked and reached ``networkidle``.
    """
    candidates = (
        "a.chapter-link.all",
        "button:has-text('Все главы')",
        "a:has-text('Все главы')",
    )
    for selector in candidates:
        try:
            toggle = page.locator(selector).first
            if not await toggle.is_visible(timeout=2000):
                continue
            await toggle.click()
            await page.wait_for_load_state("networkidle", timeout=10_000)
        except Exception:
            continue
        return
async def _extract_chapters(page: Page) -> list[Chapter]:
    """Primary parser: ``#chapters-list tr.item-row`` rows of the chapter table.

    For each row the link inside the ``item-title`` cell gives title and URL;
    the ``td[data-num]`` cell gives the volume (``data-vol``) and the chapter
    number (``data-num`` divided by 10). Rows without a link or href are
    skipped.
    """
    from urllib.parse import urljoin

    def _as_int(raw) -> int:
        # A missing or malformed attribute must not abort the whole parse.
        try:
            return int(raw or "0")
        except ValueError:
            return 0

    page_url = page.url
    # Resolve hrefs against the current page; keep the site-root fallback of
    # _base_url() when the page URL is not an http(s) URL.
    origin = page_url if page_url.startswith("http") else _base_url(page_url) + "/"
    rows = await page.query_selector_all("#chapters-list tr.item-row")
    chapters = []
    for row in rows:
        link = await row.query_selector("td[class*='item-title'] a")
        if not link:
            continue
        href = (await link.get_attribute("href") or "").strip()
        if not href:
            continue
        text = (await link.inner_text()).strip()
        td = await row.query_selector("td[data-num]")
        vol = _as_int(await td.get_attribute("data-vol")) if td else 0
        num_raw = _as_int(await td.get_attribute("data-num")) if td else 0
        number = num_raw / 10.0
        # urljoin handles absolute, root-relative, relative and
        # protocol-relative hrefs alike.
        full_url = urljoin(origin, href)
        chapters.append(Chapter(title=text, url=full_url, number=number, volume=vol))
    return chapters
async def _extract_chapters_alt(page: Page) -> list[Chapter]:
    """Fallback parser: collect every link whose href contains ``/vol``.

    Volume and chapter number are parsed from the link text. Links that point
    to the same URL are merged into one chapter. When the same URL appears
    more than once, an entry with a parsed chapter number replaces one
    without, keeping the position of the first occurrence.
    """
    result = await page.evaluate("""
        () => {
            const links = Array.from(document.querySelectorAll('a[href*="/vol"]'));
            return links.map(a => ({ href: a.href, text: a.textContent.trim() }))
                .filter(x => x.href && x.text);
        }
    """)
    by_url: dict[str, Chapter] = {}
    for x in result or []:
        href, text = x["href"], x["text"]
        # Remove the "Том N" part first; otherwise the volume number would be
        # picked up as the chapter number (the first number in the text).
        number_text = re.sub(r"Том\s+\d+", " ", text, flags=re.IGNORECASE)
        candidate = Chapter(title=text, url=href,
                            number=_parse_num(number_text), volume=_parse_vol(text))
        known = by_url.get(href)
        if known is None or (known.number == 0 and candidate.number != 0):
            by_url[href] = candidate
    return list(by_url.values())
def _base_url(url: str) -> str:
m = re.match(r"(https?://[^/]+)", url)
return m.group(1) if m else "https://readmanga.ru"
def _parse_num(text: str) -> float:
m = re.search(r"[\d]+(?:[.,]\d+)?", text.replace(",", "."))
return float(m.group()) if m else 0.0
def _parse_vol(text: str) -> int:
m = re.search(r"Том\s+(\d+)", text, re.IGNORECASE)
return int(m.group(1)) if m else 0
# ──────────────────────────────────────────────
# Страница главы — получение URL изображений
# ──────────────────────────────────────────────
async def _extract_images_from_js(page: Page) -> list[str]:
    """
    Extract image URLs from the ``readerInit(...)`` call found in a <script> tag.

    Per the original author's note the call looks like
    ``rm_h.readerInit(chapterInfo, [[base, '', path, w, h], ...])``.
    The in-page script takes the first ``[`` after the text "readerInit",
    counts brackets to find the matching ``]``, evaluates that slice and joins
    ``item[0] + item[2]`` for every entry that has at least three elements.

    Returns an empty list when nothing is found or the evaluation fails.
    """
    try:
        # NOTE(review): the script uses eval() on text taken from the page's
        # own <script> tags; it runs inside the page, not in this process.
        result = await page.evaluate("""
() => {
for (const s of document.querySelectorAll('script')) {
const text = s.textContent || '';
const mi = text.indexOf('readerInit');
if (mi === -1) continue;
const ai = text.indexOf('[', mi);
if (ai === -1) continue;
let depth = 0, end = -1;
for (let i = ai; i < text.length; i++) {
if (text[i] === '[') depth++;
else if (text[i] === ']') { depth--; if (!depth) { end = i+1; break; } }
}
if (end === -1) continue;
try {
const arr = eval(text.slice(ai, end));
if (Array.isArray(arr) && arr.length)
return arr.map(item => Array.isArray(item) && item.length >= 3
? item[0] + item[2] : null).filter(Boolean);
} catch(e) {}
}
return [];
}
""")
        if result:
            logger.debug("JS readerInit нашёл {} изображений", len(result))
        return result or []
    except Exception as e:
        # Best effort: the caller falls back to DOM scanning on an empty list.
        logger.debug("JS-метод не сработал: {}", e)
        return []
async def _extract_images_from_dom(page: Page) -> list[str]:
    """
    Collect image URLs from <img> elements already present in the DOM.

    The in-page script tries a fixed list of selectors in order and returns
    ``src`` (or ``data-src``) of the elements matched by the first selector
    that matches anything. Returns an empty list when no selector matches or
    the evaluation fails.
    """
    try:
        result = await page.evaluate("""
() => {
for (const sel of ['img.manga-page', '.page-image img', '#mangaReader img', 'img[data-src]']) {
const found = Array.from(document.querySelectorAll(sel));
if (found.length) return found.map(i => i.src || i.dataset.src).filter(Boolean);
}
return [];
}
""")
        return result or []
    except Exception:
        # Any evaluation error is treated as "no images found".
        return []
def _get_ext(url: str) -> str:
m = re.search(r"\.(jpg|jpeg|png|webp)(\?|$)", url, re.IGNORECASE)
if m:
ext = m.group(1).lower()
return ".jpg" if ext == "jpeg" else f".{ext}"
return ".jpg"
# ──────────────────────────────────────────────
# Скачивание главы
# ──────────────────────────────────────────────
async def get_chapter_images_and_download(
    page: Page,
    chapter_url: str,
    dest_dir: Path,
    manga_url: str | None = None,
) -> list[Path]:
    """Open a chapter, capture its page images from the network and save them.

    Steps:
      1. Install a route handler that fetches manga image requests itself and
         keeps their bytes, then open the chapter page (with ``mtr=1`` added
         to the query string).
      2. Read the list of image URLs from ``readerInit`` (DOM scan as
         fallback).
      3. Press ArrowRight repeatedly so the reader requests every page.
      4. Write the captured images to ``dest_dir`` as ``NNNN.<ext>`` in page
         order.

    Args:
        page: Page used to open the chapter.
        chapter_url: URL of the chapter.
        dest_dir: Directory receiving the image files (created when missing).
        manga_url: Used as Referer; defaults to the manga's page derived from
            ``chapter_url``.

    Returns:
        Paths of the saved images in page order. Pages that were never
        captured are left out; the list is empty when the chapter could not
        be opened or no image list was found.
    """
    from urllib.parse import urlparse

    logger.info("Загружаем главу: {}", chapter_url)
    parsed = urlparse(chapter_url)
    parts = parsed.path.strip("/").split("/")
    manga_slug = parts[0] if parts else ""
    referer = manga_url or f"{parsed.scheme}://{parsed.netloc}/{manga_slug}"
    load_url = chapter_url + ("?mtr=1" if "?" not in chapter_url else "&mtr=1")
    dest_dir.mkdir(parents=True, exist_ok=True)

    def _base(u: str) -> str:
        return u.split("?")[0]

    img_re = re.compile(r"\.(jpg|jpeg|png|webp)(\?|$)", re.I)
    # Hosts that serve manga pages (as opposed to the site's static assets).
    cdn_re = re.compile(r"one-way\.work|staticfa\.|rm\.one-way|cdnmanga|reimg", re.I)

    def _is_manga_image(url: str) -> bool:
        if not img_re.search(_base(url)):
            return False
        # Exclude site assets (logos, icons, fonts).
        if "resrmr." in url or "/static/" in url:
            return False
        return bool(cdn_re.search(url))

    captured: dict[str, bytes] = {}  # URL without query string -> image bytes
    lock = asyncio.Lock()

    async def route_handler(route, request):
        url = request.url
        base = _base(url)
        if not _is_manga_image(url):
            await route.continue_()
            return
        # Already captured: let the request through untouched.
        async with lock:
            already = base in captured
        if already:
            await route.continue_()
            return
        try:
            response = await route.fetch()
            body = await response.body()
            if body and len(body) > 500 and response.status in (200, 206):
                async with lock:
                    if base not in captured:
                        captured[base] = body
                        logger.debug("{}: {} байт", base.split("/")[-1], len(body))
            await route.fulfill(response=response)
        except Exception as e:
            logger.debug("route.fetch {}: {}", base[-40:], e)
            try:
                await route.continue_()
            except Exception:
                # The route may already be handled or the page may be gone.
                pass

    url_to_idx: dict[str, int] = {}

    async def _captured_pages() -> int:
        # Count only images that belong to this chapter. Other CDN images
        # (covers, previews) may be captured too and must not count as pages.
        async with lock:
            return sum(1 for u in url_to_idx if u in captured)

    await page.route("**/*", route_handler)
    try:
        # 1. Open the chapter.
        ok = await _navigate(page, load_url, referer=referer)
        if not ok:
            logger.error("Не удалось открыть главу: {}", chapter_url)
            return []
        # 2. Wait for the readerInit script to appear.
        try:
            await page.wait_for_function(
                "() => Array.from(document.querySelectorAll('script'))"
                ".some(s => s.textContent.includes('readerInit'))",
                timeout=15_000,
            )
        except Exception:
            logger.debug("readerInit не появился за 15с")
        # 3. Read the list of image URLs.
        image_urls = await _extract_images_from_js(page)
        if not image_urls:
            image_urls = await _extract_images_from_dom(page)
        if not image_urls:
            logger.error("Список изображений пуст: {}", chapter_url)
            return []
        logger.info("Найдено изображений: {}", len(image_urls))
        url_to_idx.update({_base(u): i for i, u in enumerate(image_urls)})
        # Repeated URLs share one index, so completion is measured against
        # the number of distinct URLs.
        total = len(url_to_idx)
        # 4. Page through the reader; it loads pages as the user advances.
        await asyncio.sleep(1)
        for i in range(len(image_urls) + 10):
            if await _captured_pages() >= total:
                break
            try:
                await page.keyboard.press("ArrowRight")
                await asyncio.sleep(0.5)
            except Exception:
                break
            if i % 20 == 19:
                logger.debug("Пролистано {}, загружено: {}/{}",
                             i + 1, await _captured_pages(), total)
        # Final grace period for requests still in flight.
        await asyncio.sleep(3)
    finally:
        # Remove the handler on every exit path so it does not stay installed
        # on a page that the caller keeps using.
        await page.unroute("**/*", route_handler)

    logger.info("Перехвачено: {}/{}", await _captured_pages(), total)
    # 5. Save in page order.
    paths: dict[int, Path] = {}
    for base_url, idx in url_to_idx.items():
        body = captured.get(base_url)
        if body is None:
            continue
        p = dest_dir / f"{idx:04d}{_get_ext(base_url)}"
        p.write_bytes(body)
        paths[idx] = p
    missing = total - len(paths)
    if missing:
        logger.warning("Не загружено страниц: {}", missing)
    return [paths[i] for i in sorted(paths)]

92
src/state.py Normal file
View File

@@ -0,0 +1,92 @@
"""
Хранение состояния скачивания в SQLite.
"""
import sqlite3
from datetime import datetime
from pathlib import Path
from typing import Optional
DB_PATH = Path("/app/state/progress.db")
class StateDB:
def __init__(self, db_path: Path = DB_PATH):
db_path.parent.mkdir(parents=True, exist_ok=True)
self.conn = sqlite3.connect(str(db_path))
self._init()
def _init(self):
self.conn.execute("""
CREATE TABLE IF NOT EXISTS chapters (
id INTEGER PRIMARY KEY AUTOINCREMENT,
manga_url TEXT NOT NULL,
chapter_url TEXT NOT NULL UNIQUE,
title TEXT,
number REAL,
volume INTEGER,
status TEXT DEFAULT 'pending',
output_cbz TEXT,
output_pdf TEXT,
output_epub TEXT,
updated_at TEXT
)
""")
self.conn.commit()
def upsert_chapter(self, manga_url: str, chapter_url: str,
title: str = "", number: float = 0, volume: int = 0):
self.conn.execute("""
INSERT INTO chapters (manga_url, chapter_url, title, number, volume, updated_at)
VALUES (?, ?, ?, ?, ?, ?)
ON CONFLICT(chapter_url) DO UPDATE SET
title = excluded.title,
number = excluded.number,
volume = excluded.volume
""", (manga_url, chapter_url, title, number, volume, _now()))
self.conn.commit()
def mark_done(self, chapter_url: str, fmt: str, output_path: str):
col = f"output_{fmt}"
self.conn.execute(f"""
UPDATE chapters SET status='done', {col}=?, updated_at=?
WHERE chapter_url=?
""", (output_path, _now(), chapter_url))
self.conn.commit()
def mark_failed(self, chapter_url: str):
self.conn.execute("""
UPDATE chapters SET status='failed', updated_at=? WHERE chapter_url=?
""", (_now(), chapter_url))
self.conn.commit()
def get_pending(self, manga_url: str) -> list[dict]:
cur = self.conn.execute("""
SELECT chapter_url, title, number, volume
FROM chapters
WHERE manga_url=? AND status != 'done'
ORDER BY volume, number
""", (manga_url,))
cols = [d[0] for d in cur.description]
return [dict(zip(cols, row)) for row in cur.fetchall()]
def get_all(self, manga_url: str) -> list[dict]:
cur = self.conn.execute("""
SELECT * FROM chapters WHERE manga_url=? ORDER BY volume, number
""", (manga_url,))
cols = [d[0] for d in cur.description]
return [dict(zip(cols, row)) for row in cur.fetchall()]
def chapter_status(self, chapter_url: str) -> Optional[str]:
cur = self.conn.execute(
"SELECT status FROM chapters WHERE chapter_url=?", (chapter_url,))
row = cur.fetchone()
return row[0] if row else None
def close(self):
self.conn.close()
def _now() -> str:
return datetime.utcnow().isoformat()