Base app
This commit is contained in:
380
src/worker.py
Normal file
380
src/worker.py
Normal file
@@ -0,0 +1,380 @@
|
||||
"""
|
||||
Воркер скачивания манги с поддержкой событий прогресса.
|
||||
"""
|
||||
import asyncio
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Callable, Optional
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from .browser import BrowserManager
|
||||
from .scraper import get_manga_info, get_chapter_images_and_download, Chapter
|
||||
from .exporter import export, MangaMeta
|
||||
from .state import StateDB
|
||||
|
||||
OUTPUT_DIR = Path("/app/output")
|
||||
|
||||
# Читаем из переменных окружения; можно переопределить в docker-compose
|
||||
CHAPTER_CONCURRENCY = int(os.getenv("CHAPTER_CONCURRENCY", "3"))
|
||||
|
||||
|
||||
def _safe_name(s: str) -> str:
|
||||
return re.sub(r'[^\w\s\-]', '', s).strip().replace(" ", "_")[:80]
|
||||
|
||||
|
||||
def _safe_chapter_name(ch: Chapter) -> str:
|
||||
vol = f"v{ch.volume:02d}_" if ch.volume else ""
|
||||
return f"{vol}ch{ch.number:06.1f}"
|
||||
|
||||
|
||||
async def download_manga(
|
||||
url: str,
|
||||
fmt: str = "cbz",
|
||||
output_dir: Path = OUTPUT_DIR,
|
||||
resume: bool = True,
|
||||
is_update: bool = False,
|
||||
on_event: Optional[Callable] = None,
|
||||
chapter_concurrency: int = CHAPTER_CONCURRENCY,
|
||||
):
|
||||
"""Скачать мангу. Главы обрабатываются параллельно (chapter_concurrency штук)."""
|
||||
|
||||
async def emit(event: dict):
|
||||
if on_event:
|
||||
try:
|
||||
await on_event(event)
|
||||
except Exception as e:
|
||||
logger.debug("on_event error: {}", e)
|
||||
|
||||
db = StateDB()
|
||||
db_lock = asyncio.Lock() # защита от параллельных записей в SQLite
|
||||
|
||||
async def db_call(fn, *args, **kwargs):
|
||||
"""Обёртка: все обращения к db идут через общий asyncio.Lock."""
|
||||
async with db_lock:
|
||||
return fn(*args, **kwargs)
|
||||
|
||||
try:
|
||||
await db_call(db.update_manga_status, url, "downloading")
|
||||
started_ts = await db_call(db.mark_started, url)
|
||||
await emit({"type": "manga_start", "url": url, "started_at": started_ts})
|
||||
|
||||
async with BrowserManager(headless=True) as bm:
|
||||
ctx, info_page = await bm.new_page()
|
||||
|
||||
manga = await get_manga_info(info_page, url)
|
||||
await info_page.close()
|
||||
|
||||
if not manga:
|
||||
await db_call(db.update_manga_status, url, "failed")
|
||||
await emit({"type": "manga_failed", "url": url,
|
||||
"error": "Не удалось получить информацию о манге"})
|
||||
return
|
||||
|
||||
await db_call(
|
||||
db.update_manga_info,
|
||||
url,
|
||||
title=manga.title_ru or manga.title,
|
||||
chapters_total=len(manga.chapters),
|
||||
title_ru=manga.title_ru,
|
||||
title_full=manga.title_full,
|
||||
pub_status=manga.pub_status,
|
||||
)
|
||||
await emit({
|
||||
"type": "manga_info",
|
||||
"url": url,
|
||||
"title": manga.title_ru or manga.title,
|
||||
"title_ru": manga.title_ru,
|
||||
"title_full": manga.title_full,
|
||||
"pub_status": manga.pub_status,
|
||||
"chapters_total": len(manga.chapters),
|
||||
})
|
||||
|
||||
folder_name = _safe_name(manga.title_ru or manga.title)
|
||||
manga_dir = output_dir / folder_name
|
||||
manga_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
for ch in manga.chapters:
|
||||
await db_call(db.upsert_chapter, url, ch.url, ch.title, ch.number, ch.volume)
|
||||
|
||||
formats = ["cbz", "pdf", "epub"] if fmt == "all" else [fmt]
|
||||
|
||||
# ── Разделяем главы: пропустить / скачать ────────────────────
|
||||
to_skip = []
|
||||
to_download = []
|
||||
for ch in manga.chapters:
|
||||
if resume and (await db_call(db.chapter_status, ch.url)) == "done":
|
||||
to_skip.append(ch)
|
||||
else:
|
||||
to_download.append(ch)
|
||||
|
||||
# Счётчик и блокировка для безопасного обновления из параллельных задач
|
||||
counter_lock = asyncio.Lock()
|
||||
# Начинаем с 0: to_skip-цикл сам доберёт до len(to_skip),
|
||||
# иначе sync_chapters_done() + len(to_skip) = двойной счёт
|
||||
chapters_done = 0
|
||||
|
||||
# Сообщаем о пропущенных главах (уже скачаны)
|
||||
for ch in to_skip:
|
||||
chapters_done += 1
|
||||
await emit({
|
||||
"type": "chapter_skipped",
|
||||
"url": url,
|
||||
"chapter_url": ch.url,
|
||||
"chapter_number": ch.number,
|
||||
"chapter_title": ch.title,
|
||||
"volume": ch.volume,
|
||||
"chapters_done": chapters_done,
|
||||
"chapters_total": len(manga.chapters),
|
||||
})
|
||||
|
||||
logger.info(
|
||||
"Параллельность: {} гл одновременно. Пропущено: {}, скачать: {}",
|
||||
chapter_concurrency, len(to_skip), len(to_download),
|
||||
)
|
||||
|
||||
# ── Семафор ограничивает одновременно открытые страницы ───────
|
||||
sem = asyncio.Semaphore(chapter_concurrency)
|
||||
|
||||
async def process_chapter(ch: Chapter) -> None:
|
||||
nonlocal chapters_done
|
||||
async with sem:
|
||||
# Повторная проверка (другая горутина могла скачать)
|
||||
if (await db_call(db.chapter_status, ch.url)) == "done":
|
||||
async with counter_lock:
|
||||
chapters_done += 1
|
||||
done_snap = chapters_done
|
||||
await emit({
|
||||
"type": "chapter_skipped",
|
||||
"url": url,
|
||||
"chapter_url": ch.url,
|
||||
"chapter_number": ch.number,
|
||||
"chapter_title": ch.title,
|
||||
"volume": ch.volume,
|
||||
"chapters_done": done_snap,
|
||||
"chapters_total": len(manga.chapters),
|
||||
})
|
||||
return
|
||||
|
||||
await emit({
|
||||
"type": "chapter_start",
|
||||
"url": url,
|
||||
"chapter_url": ch.url,
|
||||
"chapter_title": ch.title,
|
||||
"chapter_number": ch.number,
|
||||
"volume": ch.volume,
|
||||
"chapters_done": chapters_done,
|
||||
"chapters_total": len(manga.chapters),
|
||||
})
|
||||
|
||||
ch_page = await ctx.new_page()
|
||||
try:
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmp_path = Path(tmpdir)
|
||||
pages_done_count = [0]
|
||||
|
||||
async def on_page(page_idx: int, pages_total: int):
|
||||
pages_done_count[0] += 1
|
||||
await db_call(db.update_chapter_pages,
|
||||
ch.url, pages_total, pages_done_count[0])
|
||||
await emit({
|
||||
"type": "page_done",
|
||||
"url": url,
|
||||
"chapter_url": ch.url,
|
||||
"page_idx": page_idx,
|
||||
"pages_done": pages_done_count[0],
|
||||
"pages_total": pages_total,
|
||||
})
|
||||
|
||||
image_paths = await get_chapter_images_and_download(
|
||||
ch_page, ch.url,
|
||||
dest_dir=tmp_path,
|
||||
manga_url=url,
|
||||
on_page=on_page,
|
||||
)
|
||||
|
||||
if not image_paths:
|
||||
logger.error(
|
||||
"Т{} Гл.{} '{}' — get_chapter_images вернул пустой список. "
|
||||
"URL: {}",
|
||||
ch.volume, ch.number, ch.title, ch.url,
|
||||
)
|
||||
await db_call(db.mark_failed, ch.url)
|
||||
await emit({"type": "chapter_failed", "url": url,
|
||||
"chapter_url": ch.url})
|
||||
return
|
||||
|
||||
ch_name = _safe_chapter_name(ch)
|
||||
ch_meta = MangaMeta(
|
||||
series=manga.title_ru or manga.title,
|
||||
series_full=manga.title_full or "",
|
||||
chapter_title=ch.title,
|
||||
number=ch.number,
|
||||
volume=ch.volume,
|
||||
chapters_total=len(manga.chapters),
|
||||
pub_status=manga.pub_status,
|
||||
source_url=url,
|
||||
summary=manga.description,
|
||||
genre=", ".join(manga.genres) if manga.genres else "",
|
||||
)
|
||||
for f in formats:
|
||||
out_file = manga_dir / f"{ch_name}.{f}"
|
||||
try:
|
||||
export(image_paths, out_file, f, meta=ch_meta)
|
||||
await db_call(db.mark_done, ch.url, f, str(out_file))
|
||||
except Exception as e:
|
||||
logger.exception(
|
||||
"Ошибка экспорта Т{} Гл.{} → {} | {}: {}",
|
||||
ch.volume, ch.number, f, out_file.name, e,
|
||||
)
|
||||
|
||||
event_type = "auto_downloaded" if is_update else "downloaded"
|
||||
await db_call(
|
||||
db.add_history,
|
||||
manga_url=url,
|
||||
event_type=event_type,
|
||||
chapter_url=ch.url,
|
||||
chapter_title=ch.title,
|
||||
chapter_number=ch.number,
|
||||
volume=ch.volume,
|
||||
)
|
||||
|
||||
async with counter_lock:
|
||||
chapters_done += 1
|
||||
done_snap = chapters_done
|
||||
|
||||
await emit({
|
||||
"type": "chapter_done",
|
||||
"url": url,
|
||||
"chapter_url": ch.url,
|
||||
"chapter_title": ch.title,
|
||||
"chapter_number": ch.number,
|
||||
"volume": ch.volume,
|
||||
"chapters_done": done_snap,
|
||||
"chapters_total": len(manga.chapters),
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(
|
||||
"Необработанное исключение в Т{} Гл.{} '{}' | {}: {}",
|
||||
ch.volume, ch.number, ch.title, ch.url, e,
|
||||
)
|
||||
await db_call(db.mark_failed, ch.url)
|
||||
await emit({"type": "chapter_failed", "url": url,
|
||||
"chapter_url": ch.url, "error": str(e)})
|
||||
finally:
|
||||
await ch_page.close()
|
||||
|
||||
# ── Запускаем все задачи сразу; семафор дозирует параллельность ──
|
||||
tasks = [process_chapter(ch) for ch in to_download]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
# Логируем неожиданные исключения из gather
|
||||
for ch, res in zip(to_download, results):
|
||||
if isinstance(res, Exception) and not isinstance(res, asyncio.CancelledError):
|
||||
logger.exception(
|
||||
"gather: необработанное исключение Т{} Гл.{} '{}': {}",
|
||||
ch.volume, ch.number, ch.title, res,
|
||||
)
|
||||
|
||||
real_done = await db_call(db.sync_chapters_done, url)
|
||||
await db_call(db.update_manga_status, url, "done")
|
||||
finished_ts = await db_call(db.mark_finished, url)
|
||||
await db_call(db.set_last_checked, url)
|
||||
await emit({
|
||||
"type": "manga_done",
|
||||
"url": url,
|
||||
"chapters_done": real_done,
|
||||
"chapters_total": len(manga.chapters),
|
||||
"finished_at": finished_ts,
|
||||
})
|
||||
await ctx.close()
|
||||
|
||||
except asyncio.CancelledError:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Manga worker error {}: {}", url, e)
|
||||
await db_call(db.update_manga_status, url, "failed")
|
||||
finished_ts = await db_call(db.mark_finished, url)
|
||||
await emit({"type": "manga_failed", "url": url, "error": str(e), "finished_at": finished_ts})
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
async def check_for_updates(
|
||||
url: str,
|
||||
on_event: Optional[Callable] = None,
|
||||
) -> list[str]:
|
||||
"""
|
||||
Проверяет наличие новых глав для манги.
|
||||
Возвращает список новых chapter_url.
|
||||
"""
|
||||
async def emit(event: dict):
|
||||
if on_event:
|
||||
try:
|
||||
await on_event(event)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
db = StateDB()
|
||||
try:
|
||||
db.set_last_checked(url)
|
||||
db.add_history(manga_url=url, event_type="check_started")
|
||||
await emit({"type": "check_started", "url": url})
|
||||
|
||||
async with BrowserManager(headless=True) as bm:
|
||||
_, page = await bm.new_page()
|
||||
manga = await get_manga_info(page, url)
|
||||
await page.close()
|
||||
if not manga:
|
||||
return []
|
||||
|
||||
# Обновляем pub_status и количество глав
|
||||
db.update_manga_info(
|
||||
url,
|
||||
title=manga.title_ru or manga.title,
|
||||
chapters_total=len(manga.chapters),
|
||||
title_ru=manga.title_ru,
|
||||
title_full=manga.title_full,
|
||||
pub_status=manga.pub_status,
|
||||
)
|
||||
|
||||
# Находим главы которых ещё нет в БД
|
||||
known = {ch["chapter_url"] for ch in db.get_all_chapters(url)}
|
||||
new_chapters = [ch for ch in manga.chapters if ch.url not in known]
|
||||
|
||||
for ch in new_chapters:
|
||||
db.upsert_chapter(url, ch.url, ch.title, ch.number, ch.volume)
|
||||
db.add_history(
|
||||
manga_url=url,
|
||||
event_type="new_chapter_found",
|
||||
chapter_url=ch.url,
|
||||
chapter_title=ch.title,
|
||||
chapter_number=ch.number,
|
||||
volume=ch.volume,
|
||||
)
|
||||
await emit({
|
||||
"type": "new_chapter_found",
|
||||
"url": url,
|
||||
"chapter_url": ch.url,
|
||||
"chapter_title": ch.title,
|
||||
"chapter_number": ch.number,
|
||||
})
|
||||
|
||||
db.add_history(
|
||||
manga_url=url,
|
||||
event_type="check_done",
|
||||
details=f"Найдено новых: {len(new_chapters)}",
|
||||
)
|
||||
await emit({
|
||||
"type": "check_done",
|
||||
"url": url,
|
||||
"new_chapters": len(new_chapters),
|
||||
})
|
||||
|
||||
return [ch.url for ch in new_chapters]
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
Reference in New Issue
Block a user