diff --git a/frontend/index.html b/frontend/index.html index 1a80227..4ba7fbb 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -108,7 +108,7 @@

Добавить мангу

- +
- + +
+
+ + @@ -132,6 +145,8 @@ class="px-4 py-3 text-sm font-semibold border-b-2 border-transparent text-gray-400 hover:text-white">🔔 Новости +
@@ -177,6 +192,36 @@
Загрузка...
+ + + + + + + + @@ -281,6 +326,7 @@ const state = { mangas: {}, // url → manga object chapters: {}, // manga_url → [chapter, ...] filter: 'all', + sources: [], // [{id, slug, display_name, domains}] }; // ── Auth ───────────────────────────────────── @@ -407,8 +453,12 @@ function handleEvent(msg) { case 'manga_queued': if(!state.mangas[msg.url]) { + const srcInfo = msg.source_id ? (state.sources.find(s => s.id === msg.source_id) || null) : null; state.mangas[msg.url] = { url: msg.url, title: msg.url, status: 'queued', format: msg.format, - chapters_total: 0, chapters_done: 0, size_human: '—' }; + chapters_total: 0, chapters_done: 0, size_human: '—', + source: srcInfo ? {id: srcInfo.id, slug: srcInfo.slug, display_name: srcInfo.display_name} : null }; + } else { + state.mangas[msg.url].status = 'queued'; } renderList(); loadStats(); @@ -597,6 +647,25 @@ function handleEvent(msg) { renderList(); loadStats(); break; + + case 'source_unknown': + _showNotification('⚠ Источник не определён для ' + (state.mangas[msg.url]?.title || msg.url) + '. 
Выберите источник.', 'warn'); + if(state.mangas[msg.url]) { state.mangas[msg.url].status = 'failed'; renderList(); } + break; + + case 'source_domain_added': + case 'source_domain_removed': + loadSources(); + break; + + case 'source_switched': + if(state.mangas[msg.url]) { + // Обновляем source у манги из актуального списка источников + const newSrc = state.sources.find(s => s.id === msg.new_source_id); + if(newSrc) state.mangas[msg.url].source = {id: newSrc.id, slug: newSrc.slug, display_name: newSrc.display_name}; + updateMangaRow(msg.url); + } + break; } } @@ -604,7 +673,7 @@ function handleEvent(msg) { let newsUnreadCount = 0; function switchTab(tab) { - ['mangas', 'news', 'history'].forEach(t => { + ['mangas', 'news', 'history', 'settings'].forEach(t => { document.getElementById('tab-content-'+t).classList.toggle('hidden', t !== tab); const btn = document.getElementById('tab-'+t); btn.className = t === tab @@ -614,6 +683,8 @@ function switchTab(tab) { document.getElementById('manga-filters').classList.toggle('hidden', tab !== 'mangas'); if(tab === 'history') loadHistory(); if(tab === 'news') { newsUnreadCount = 0; updateNewsBadge(); loadNews(); } + if(tab === 'settings') loadSources(); +} } function updateNewsBadge() { @@ -773,6 +844,66 @@ async function checkNowBtn(btn, url) { } } +// ── Source detection ───────────────────────── +let _resolveTimer = null; +let _resolvedSourceId = null; // null = found via domain, undefined = unknown + +async function onUrlInputChange() { + clearTimeout(_resolveTimer); + _resolveTimer = setTimeout(_resolveSource, 400); +} + +async function _resolveSource() { + const raw = document.getElementById('url-input').value.trim(); + const hint = document.getElementById('source-hint'); + const hintFound = document.getElementById('source-hint-found'); + const hintUnknown = document.getElementById('source-hint-unknown'); + + // Берём первый непустой URL + const url = raw.split('\n').map(u=>u.trim()).filter(Boolean)[0]; + if(!url) { + 
hint.classList.add('hidden'); + _resolvedSourceId = null; + document.getElementById('add-btn').disabled = false; + return; + } + + try { + const r = await fetch('/api/resolve-source?url=' + encodeURIComponent(url)); + const data = await r.json(); + hint.classList.remove('hidden'); + + if(data.source) { + hintFound.classList.remove('hidden'); + hintUnknown.classList.add('hidden'); + document.getElementById('source-hint-name').textContent = data.source.display_name; + _resolvedSourceId = data.source.id; + document.getElementById('add-btn').disabled = false; + } else { + hintFound.classList.add('hidden'); + hintUnknown.classList.remove('hidden'); + _resolvedSourceId = undefined; // неизвестен — нужен ручной выбор + document.getElementById('add-btn').disabled = true; + // Заполняем список источников + const sel = document.getElementById('source-manual-select'); + sel.innerHTML = ''; + (state.sources || []).forEach(s => { + const opt = document.createElement('option'); + opt.value = s.id; + opt.textContent = s.display_name; + sel.appendChild(opt); + }); + sel.onchange = () => { + document.getElementById('add-btn').disabled = !sel.value; + }; + } + } catch(e) { + hint.classList.add('hidden'); + _resolvedSourceId = null; + document.getElementById('add-btn').disabled = false; + } +} + // ── API ────────────────────────────────────── async function loadStats() { try { @@ -788,17 +919,35 @@ async function addToQueue() { const urls = raw.split('\n').map(u=>u.trim()).filter(Boolean); if(!urls.length) return; + // Определяем source_id + let sourceId = null; + if(_resolvedSourceId === undefined) { + // Неизвестный домен — нужен ручной выбор + const manualVal = document.getElementById('source-manual-select').value; + if(!manualVal) { alert('Выберите источник для добавления манги'); return; } + sourceId = parseInt(manualVal); + } else if(_resolvedSourceId !== null) { + sourceId = _resolvedSourceId; + } + try { + const body = {urls, format: fmt}; + if(sourceId !== null) 
body.source_id = sourceId; const r = await fetch('/api/queue', { method:'POST', headers:{'Content-Type':'application/json'}, - body: JSON.stringify({urls, format: fmt}) + body: JSON.stringify(body) }); const data = await r.json(); const msg = document.getElementById('add-msg'); msg.textContent = `✓ Добавлено: ${data.added.length}, уже есть: ${data.skipped.length}`; msg.classList.remove('hidden'); - if(data.added.length) document.getElementById('url-input').value = ''; + if(data.added.length) { + document.getElementById('url-input').value = ''; + document.getElementById('source-hint').classList.add('hidden'); + _resolvedSourceId = null; + document.getElementById('add-btn').disabled = false; + } setTimeout(()=>msg.classList.add('hidden'), 4000); } catch(e) { alert('Ошибка: ' + e.message); @@ -825,6 +974,193 @@ async function resumeManga(url) { } } +// ── Sources ─────────────────────────────────── +async function loadSources() { + try { + const r = await fetch('/api/sources'); + if(r.ok) { + state.sources = await r.json(); + if(!document.getElementById('tab-content-settings').classList.contains('hidden')) { + renderSources(); + } + } + } catch(e) {} +} + +function renderSources() { + const container = document.getElementById('sources-list'); + if(!container) return; + if(!state.sources.length) { + container.innerHTML = '
Нет доступных источников
'; + return; + } + container.innerHTML = state.sources.map(s => ` +
+
+
+ ${escHtml(s.display_name)} + slug: ${escHtml(s.slug)} +
+
+
+ ${s.domains.map(d => ` + + ${escHtml(d)} + + + `).join('')} + + + +
+
+ `).join(''); +} + +function showAddDomain(sourceId) { + const area = document.getElementById('add-domain-area-' + sourceId); + if(!area) return; + area.innerHTML = ` + + + + + + `; + setTimeout(() => document.getElementById('new-domain-input-' + sourceId)?.focus(), 50); +} + +async function addDomain(sourceId) { + const input = document.getElementById('new-domain-input-' + sourceId); + if(!input) return; + const domain = input.value.trim().toLowerCase(); + if(!domain) return; + try { + const r = await fetch(`/api/sources/${sourceId}/domains`, { + method: 'POST', + headers: {'Content-Type': 'application/json'}, + body: JSON.stringify({domain}), + }); + if(!r.ok) { + const err = await r.json(); + _showNotification('Ошибка: ' + (err.detail || 'неизвестная ошибка'), 'error'); + return; + } + await loadSources(); + } catch(e) { + _showNotification('Ошибка: ' + e.message, 'error'); + } +} + +async function removeDomain(sourceId, domain) { + if(!confirm(`Удалить домен «${domain}»?`)) return; + try { + const r = await fetch(`/api/sources/${sourceId}/domains/${encodeURIComponent(domain)}`, {method: 'DELETE'}); + if(!r.ok) { + const err = await r.json(); + _showNotification('Ошибка: ' + (err.detail || 'неизвестная ошибка'), 'error'); + return; + } + await loadSources(); + } catch(e) { + _showNotification('Ошибка: ' + e.message, 'error'); + } +} + +// ── Switch Source Modal ─────────────────────── +let _switchSourceUrl = null; + +function openSwitchSourceModal(url) { + _switchSourceUrl = url; + const manga = state.mangas[url]; + const modal = document.getElementById('switch-source-modal'); + const sel = document.getElementById('switch-source-select'); + const warning = document.getElementById('switch-source-warning'); + + document.getElementById('switch-source-current').textContent = + 'Текущий источник: ' + (manga?.source?.display_name || 'не определён'); + + sel.innerHTML = ''; + state.sources.forEach(s => { + const opt = document.createElement('option'); + opt.value = 
s.id; + opt.textContent = s.display_name; + if(manga?.source?.id === s.id) opt.selected = true; + sel.appendChild(opt); + }); + + try { + const domain = new URL(url).hostname.replace(/^www\./, ''); + warning.textContent = `⚠ Домен «${domain}» будет перепривязан к выбранному источнику. Это затронет все манги с этого домена.`; + warning.classList.remove('hidden'); + } catch(e) { warning.classList.add('hidden'); } + + modal.classList.remove('hidden'); + modal.classList.add('flex'); +} + +function closeSwitchSourceModal() { + _switchSourceUrl = null; + const modal = document.getElementById('switch-source-modal'); + modal.classList.add('hidden'); + modal.classList.remove('flex'); +} + +async function confirmSwitchSource() { + const url = _switchSourceUrl; + const sourceId = parseInt(document.getElementById('switch-source-select').value); + if(!url || !sourceId) return; + try { + const r = await fetch('/api/mangas/switch-source', { + method: 'POST', + headers: {'Content-Type': 'application/json'}, + body: JSON.stringify({url, source_id: sourceId}), + }); + if(!r.ok) { + const err = await r.json(); + _showNotification('Ошибка: ' + (err.detail || 'неизвестная ошибка'), 'error'); + return; + } + const data = await r.json(); + closeSwitchSourceModal(); + _showNotification( + `✓ Источник изменён на «${data.source_name}»` + + (data.chapters_reset ? `. 
Сброшено глав: ${data.chapters_reset}` : ''), 'ok' + ); + if(state.mangas[url]) { + const src = state.sources.find(s => s.id === sourceId); + if(src) state.mangas[url].source = {id: src.id, slug: src.slug, display_name: src.display_name}; + updateMangaRow(url); + } + } catch(e) { + _showNotification('Ошибка: ' + e.message, 'error'); + } +} + +document.addEventListener('click', function(e) { + const modal = document.getElementById('switch-source-modal'); + if(modal && !modal.classList.contains('hidden') && e.target === modal) closeSwitchSourceModal(); +}); + +// ── Notification helper ─────────────────────── +function _showNotification(text, type='ok') { + const el = document.getElementById('add-msg'); + if(!el) return; + el.textContent = text; + el.style.color = type === 'error' ? '#f87171' : type === 'warn' ? '#fbbf24' : '#4ade80'; + el.classList.remove('hidden'); + setTimeout(() => el.classList.add('hidden'), 5000); +} + // ── Delete modal ───────────────────────────── let _deleteUrl = null; let _deleteFilesChecked = false; @@ -1011,6 +1347,12 @@ function pubStatusPill(s) { return `${map[s]}`; } +function _sourceBadge(source) { + if(!source) return 'Источник неизвестен'; + if(source.slug === 'unknown') return '' + escHtml(source.display_name) + ''; + return '' + escHtml(source.display_name) + ''; +} + // ── Время загрузки ──────────────────────────── // Храним интервал живого таймера: url → intervalId const _timerIntervals = {}; @@ -1126,6 +1468,7 @@ function renderMangaRow(m) {
${statusPill(m.status)} ${pubStatusPill(m.pub_status || 'unknown')} + ${_sourceBadge(m.source)} ${escHtml(m.title || m.url)}
@@ -1170,6 +1513,11 @@ function _rowButtons(m) { title="${m.errors_count} проблем при загрузке" style="background:#450a0a;color:#fca5a5;padding:4px 8px;border-radius:6px;font-size:0.75rem;cursor:pointer">⚠️ ${m.errors_count}` : ''} + ${!isActive + ? `` + : ''} ${isActive ? `` : ''} @@ -1282,6 +1630,7 @@ function _patchRow(el, m) { set('status', statusPill(m.status)); set('pubstatus', pubStatusPill(m.pub_status || 'unknown')); + set('source', _sourceBadge(m.source)); set('title', escHtml(m.title || m.url)); set('chcount', `📖 ${chDone}/${chTotal} глав`); set('size', `💾 ${m.size_human || '—'}`); @@ -1661,6 +2010,7 @@ async function saveRenameFolder() { async function initApp() { _initDeleteModal(); await loadStats(); + await loadSources(); connectWS(); // Загружаем список манги try { diff --git a/src/api.py b/src/api.py index 9aa22f7..244e7b5 100644 --- a/src/api.py +++ b/src/api.py @@ -20,6 +20,7 @@ from loguru import logger from .state import StateDB from .worker import download_manga, check_for_updates from .exporter import patch_meta, MangaMeta +from .sources import registry, get_source_for_url, extract_domain OUTPUT_DIR = Path("/app/output") FRONTEND_DIR = Path("/app/frontend") @@ -172,6 +173,16 @@ async def _queue_worker_loop(): @app.on_event("startup") async def startup_event(): + # Синхронизируем источники с кодом и мигрируем существующие манги + _db = StateDB() + try: + _db.sync_sources(registry) + migrated = _db.migrate_manga_sources() + if migrated: + logger.info("Авто-миграция: проставлен source_id для {} манг", migrated) + finally: + _db.close() + asyncio.create_task(queue_worker()) asyncio.create_task(update_scheduler()) # Восстанавливаем очередь из БД (незавершённые задачи) @@ -365,6 +376,16 @@ def _enrich_manga(m: dict, db: StateDB) -> dict: AND pages_total > 0 AND pages_done < pages_total""", (m["url"],) ).fetchone()[0] + + # Источник + source_info = None + if m.get("source_id"): + src = db.get_source_by_id(m["source_id"]) + if src: + source_info = 
{"id": src["id"], "slug": src["slug"], "display_name": src["display_name"]} + else: + source_info = {"id": m["source_id"], "slug": "unknown", "display_name": "Источник недоступен"} + return { **m, "chapters_done": ch_done_count, @@ -375,6 +396,7 @@ def _enrich_manga(m: dict, db: StateDB) -> dict: "errors_count": ch_failed + ch_partial, "started_at": m.get("started_at"), "finished_at": m.get("finished_at"), + "source": source_info, } @@ -454,6 +476,7 @@ def _manga_detail(manga: dict, db: StateDB) -> dict: class AddMangaRequest(BaseModel): urls: List[str] format: str = "cbz" + source_id: Optional[int] = None # явный выбор источника (для неизвестных доменов) # ── Auth API ───────────────────────────────── @@ -536,7 +559,24 @@ async def add_to_queue(body: AddMangaRequest): url = url.strip() if not url: continue - is_new = db.add_manga(url, body.format) + + # Определяем source_id: явный из запроса или авто по домену + source_id = body.source_id + if source_id is None: + domain = extract_domain(url) + source_row = db.get_source_by_domain(domain) + if source_row: + source_id = source_row["id"] + + # Если источник указан явно — привязываем домен к нему + if body.source_id is not None: + domain = extract_domain(url) + existing = db.get_source_by_domain(domain) + if existing and existing["id"] != body.source_id: + db.remove_domain(existing["id"], domain) + db.add_domain(body.source_id, domain) + + is_new = db.add_manga(url, body.format, source_id=source_id) if is_new: await download_queue.put({"url": url, "fmt": body.format}) added.append(url) @@ -544,9 +584,9 @@ async def add_to_queue(body: AddMangaRequest): "type": "manga_queued", "url": url, "format": body.format, + "source_id": source_id, }) await _broadcast_queue_positions() - # Запускаем фоновую задачу предпросмотра (без Chromium — быстро) asyncio.create_task(_fetch_preview(url)) else: skipped.append(url) @@ -559,15 +599,27 @@ async def _fetch_preview(url: str): """Быстро получает название и количество глав сразу после 
добавления.""" try: from .browser import BrowserManager - from .scraper import get_manga_info - async with BrowserManager(headless=True) as bm: - _, page = await bm.new_page() - manga = await get_manga_info(page, url) - if not manga: - return db = StateDB() try: - db.update_manga_info( + source = get_source_for_url(url, db) + if source is None: + manga_row = db.get_manga(url) + if manga_row and manga_row.get("source_id"): + source = registry.get_by_db_id(manga_row["source_id"], db) + finally: + db.close() + + if source is None: + return + + async with BrowserManager(headless=True) as bm: + _, page = await bm.new_page() + manga = await source.get_manga_info(page, url) + if not manga: + return + db2 = StateDB() + try: + db2.update_manga_info( url, title=manga.title_ru or manga.title, chapters_total=len(manga.chapters), @@ -576,7 +628,7 @@ async def _fetch_preview(url: str): pub_status=manga.pub_status, ) finally: - db.close() + db2.close() await ws_manager.broadcast({ "type": "manga_preview", "url": url, @@ -996,6 +1048,151 @@ async def delete_manga(url: str, delete_files: bool = False): db.close() + +# ── Sources API ─────────────────────────────── + +class DomainAdd(BaseModel): + domain: str + + +class SwitchSourceRequest(BaseModel): + url: str + source_id: int + + +@app.get("/api/sources") +async def list_sources(): + """Список всех источников с доменами.""" + db = StateDB() + try: + return db.get_all_sources() + finally: + db.close() + + +@app.get("/api/resolve-source") +async def resolve_source(url: str): + """Определить источник по URL. 
Возвращает {id, slug, display_name} или null.""" + db = StateDB() + try: + domain = extract_domain(url) + row = db.get_source_by_domain(domain) + if not row: + return {"source": None, "domain": domain} + return { + "source": { + "id": row["id"], + "slug": row["slug"], + "display_name": row["display_name"], + }, + "domain": domain, + } + finally: + db.close() + + +@app.post("/api/sources/{source_id}/domains") +async def add_domain(source_id: int, body: DomainAdd): + """Добавить домен к источнику.""" + db = StateDB() + try: + source = db.get_source_by_id(source_id) + if not source: + raise HTTPException(status_code=404, detail="Источник не найден") + domain = body.domain.lower().strip() + if not domain: + raise HTTPException(status_code=400, detail="Домен не может быть пустым") + # Проверяем не занят ли домен другим источником + existing = db.get_source_by_domain(domain) + if existing and existing["id"] != source_id: + raise HTTPException( + status_code=409, + detail=f"Домен уже привязан к источнику «{existing['display_name']}»" + ) + ok = db.add_domain(source_id, domain) + if not ok: + raise HTTPException(status_code=409, detail="Домен уже существует") + await ws_manager.broadcast({ + "type": "source_domain_added", + "source_id": source_id, + "domain": domain, + }) + return {"ok": True, "domain": domain} + finally: + db.close() + + +@app.delete("/api/sources/{source_id}/domains/{domain:path}") +async def remove_domain(source_id: int, domain: str): + """Удалить домен у источника.""" + db = StateDB() + try: + source = db.get_source_by_id(source_id) + if not source: + raise HTTPException(status_code=404, detail="Источник не найден") + ok = db.remove_domain(source_id, domain) + if not ok: + raise HTTPException(status_code=404, detail="Домен не найден") + await ws_manager.broadcast({ + "type": "source_domain_removed", + "source_id": source_id, + "domain": domain, + }) + return {"ok": True} + finally: + db.close() + + +@app.post("/api/mangas/switch-source") +async def 
switch_manga_source(body: SwitchSourceRequest): + """Сменить источник у манги + перепривязать домен.""" + db = StateDB() + try: + manga = db.get_manga(body.url) + if not manga: + raise HTTPException(status_code=404, detail="Манга не найдена") + if manga["status"] == "downloading" and body.url in active_tasks: + raise HTTPException(status_code=400, detail="Нельзя сменить источник во время загрузки") + + new_source = db.get_source_by_id(body.source_id) + if not new_source: + raise HTTPException(status_code=404, detail="Источник не найден") + + old_source_id = manga.get("source_id") + domain = extract_domain(body.url) + + # Перепривязываем домен + if domain: + existing_domain = db.get_source_by_domain(domain) + if existing_domain and existing_domain["id"] != body.source_id: + db.remove_domain(existing_domain["id"], domain) + db.add_domain(body.source_id, domain) + + # Меняем источник у манги + db.set_manga_source(body.url, body.source_id) + + # Сбрасываем failed/partial главы → pending + reset_count = db.reset_failed_chapters(body.url) + + await ws_manager.broadcast({ + "type": "source_switched", + "url": body.url, + "old_source_id": old_source_id, + "new_source_id": body.source_id, + "new_source_name": new_source["display_name"], + "domain_rebound": bool(domain), + "chapters_reset": reset_count, + }) + return { + "ok": True, + "source_id": body.source_id, + "source_name": new_source["display_name"], + "chapters_reset": reset_count, + } + finally: + db.close() + + @app.get("/api/stats") async def global_stats(): db = StateDB() diff --git a/src/scraper.py b/src/scraper.py index 27f9063..46aa6da 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -1,665 +1,19 @@ """ -Парсер readmanga.ru: список глав и URL/байты изображений внутри главы. +Обратно-совместимый shim: делегирует вызовы ReadmangaSource. +Не используйте напрямую в новом коде — используйте src.sources.registry. 
""" -import asyncio -import re -import time -from dataclasses import dataclass, field -from pathlib import Path -from typing import Optional +from .sources.base import Chapter, MangaInfo # noqa: F401 — реэкспорт для импортёров +from .sources.readmanga import ReadmangaSource -from loguru import logger -from playwright.async_api import Page - -from .browser import BrowserManager +_instance = ReadmangaSource() -# ────────────────────────────────────────────── -# Модели данных -# ────────────────────────────────────────────── - -@dataclass -class Chapter: - title: str - url: str - number: float = 0.0 - volume: int = 0 +async def get_manga_info(page, url): + return await _instance.get_manga_info(page, url) -@dataclass -class MangaInfo: - title: str - url: str - chapters: list[Chapter] = field(default_factory=list) - pub_status: str = "unknown" # completed / ongoing / unknown - title_ru: str = "" # Только русский тайтл (для папки) - title_full: str = "" # Полный тайтл как на странице - description: str = "" # Описание/синопсис - genres: list[str] = field(default_factory=list) # Жанры - - -# ────────────────────────────────────────────── -# Страница манги — список глав -# ────────────────────────────────────────────── - -async def get_manga_info(page: Page, url: str) -> Optional[MangaInfo]: - """Открывает страницу манги и возвращает список всех глав.""" - logger.info("Загружаем страницу манги: {}", url) - ok = await _navigate(page, url) - if not ok: - return None - - title_full = await page.title() - title_full = re.sub(r"\s*[-–|].*$", "", title_full).strip() - - # Пробуем взять русский тайтл напрямую из DOM - title_ru = await _extract_ru_title_from_dom(page) - if not title_ru: - title_ru = _parse_ru_title(title_full) - - logger.info("Манга: {} | ru: {}", title_full, title_ru) - - pub_status = await _extract_pub_status(page) - logger.info("Статус выпуска: {}", pub_status) - - description = await _extract_description(page) - genres = await _extract_genres(page) - - await 
_expand_chapters(page) - chapters = await _extract_chapters(page) - if not chapters: - chapters = await _extract_chapters_alt(page) - - logger.info("Найдено глав: {}", len(chapters)) - return MangaInfo( - title=title_ru or title_full, - url=url, - chapters=chapters, - pub_status=pub_status, - title_ru=title_ru, - title_full=title_full, - description=description, - genres=genres, +async def get_chapter_images_and_download(page, chapter_url, dest_dir, + manga_url=None, on_page=None): + return await _instance.get_chapter_images_and_download( + page, chapter_url, dest_dir, manga_url=manga_url, on_page=on_page ) - - -async def _extract_ru_title_from_dom(page: Page) -> str: - """Ищет русский тайтл в структуре страницы readmanga.""" - try: - result = await page.evaluate(""" - () => { - // readmanga: основной тайтл в span.name внутри .names - const selectors = [ - '.names .name', - 'h1.manga-title', - 'h1 .name', - '.name-block .name', - ]; - for (const sel of selectors) { - const el = document.querySelector(sel); - if (el && el.textContent.trim()) return el.textContent.trim(); - } - return ''; - } - """) - return (result or "").strip() - except Exception: - return "" - - -def _parse_ru_title(full_title: str) -> str: - """Извлекает русский тайтл из полной строки тайтла. - - Примеры: - 'Манга Режим — АД. Хардкорный геймер ... (Hellmode)' → 'Режим — АД. Хардкорный геймер ...' - 'Манга Магическая битва (Sorcery Fight) Гэгэ онлайн' → 'Магическая битва' - 'Авантюрист Monster Eater Adventurer' → 'Авантюрист' - """ - t = full_title.strip() - # Убираем префикс "Манга " - t = re.sub(r'^Манга\s+', '', t).strip() - # Берём только до первой скобки (начало английского тайтла) - t = re.split(r'\s*[\(\[]', t)[0].strip() - # Убираем суффикс " онлайн" - t = re.sub(r'\s+онлайн\s*$', '', t, flags=re.IGNORECASE).strip() - - # Обрезаем хвост из латинских слов. - # Правило: стоп только на токене содержащем латиницу (a-zA-Z). - # Пунктуация между кириллическими словами (—, –, ., :, !) 
— сохраняем. - words = t.split() - result = [] - for w in words: - if re.search(r'[а-яёА-ЯЁ]', w): - result.append(w) - elif re.search(r'[a-zA-Z]', w): - # Первое латинское слово после кириллических — обрезаем здесь - if result: - break - else: - # Чисто пунктуационный токен (—, –, ., :, …) - # Добавляем только если уже есть кириллические слова (связка внутри) - if result: - result.append(w) - - # Убираем висячую пунктуацию в конце (если последнее слово — не кириллица) - while result and not re.search(r'[а-яёА-ЯЁ]', result[-1]): - result.pop() - - if result: - t = ' '.join(result) - return t - - -async def _extract_pub_status(page: Page) -> str: - """Извлекает статус выпуска: completed / ongoing / unknown.""" - try: - result = await page.evaluate(""" - () => { - // readmanga хранит статус в .elem_status .value или похожих блоках - const statusSelectors = [ - '.elem_status .value', - '.manga-info .status', - '[class*="status"] .value', - '.property .status', - ]; - for (const sel of statusSelectors) { - const el = document.querySelector(sel); - if (el) { - const t = el.textContent.toLowerCase(); - if (t.includes('завершён') || t.includes('завершен') || t.includes('complete')) return 'completed'; - if (t.includes('продолжает') || t.includes('ongoing')) return 'ongoing'; - } - } - // Fallback: сканируем весь текст страницы - const bodyText = document.body ? 
document.body.innerText.toLowerCase() : ''; - if (bodyText.includes('выпуск завершён') || bodyText.includes('выпуск завершен')) return 'completed'; - if (bodyText.includes('продолжается')) return 'ongoing'; - return 'unknown'; - } - """) - return result or "unknown" - except Exception: - return "unknown" - - -async def _extract_description(page: Page) -> str: - """Извлекает описание/синопсис манги.""" - try: - result = await page.evaluate(""" - () => { - const selectors = [ - '.manga-description', - '.elem_descr .value', - '#tab-description .description-text', - '.description', - '[itemprop="description"]', - ]; - for (const sel of selectors) { - const el = document.querySelector(sel); - if (el && el.textContent.trim()) return el.textContent.trim(); - } - return ''; - } - """) - return (result or "").strip()[:2000] # обрезаем до 2000 символов - except Exception: - return "" - - -async def _extract_genres(page: Page) -> list[str]: - """Извлекает список жанров манги.""" - try: - result = await page.evaluate(""" - () => { - const selectors = [ - '.elem_genre .value a', - '.genres a', - '[itemprop="genre"]', - '.genre-list a', - ]; - for (const sel of selectors) { - const els = document.querySelectorAll(sel); - if (els.length) return Array.from(els).map(e => e.textContent.trim()).filter(Boolean); - } - return []; - } - """) - return result or [] - except Exception: - return [] - - -async def _navigate(page: Page, url: str, retries: int = 3, - referer: str | None = None) -> bool: - from urllib.parse import urlparse - if referer is None: - p = urlparse(url) - referer = f"{p.scheme}://{p.netloc}/" - for attempt in range(1, retries + 1): - try: - resp = await page.goto(url, wait_until="domcontentloaded", - timeout=60_000, referer=referer) - if resp and resp.status >= 400: - logger.warning("Попытка {}/{}: HTTP {}", attempt, retries, resp.status) - await asyncio.sleep(3 * attempt) - continue - try: - await page.wait_for_load_state("networkidle", timeout=10_000) - except 
Exception: - pass - return True - except Exception as e: - logger.warning("Попытка {}/{}: {}", attempt, retries, e) - await asyncio.sleep(3 * attempt) - return False - - -async def _expand_chapters(page: Page): - for sel in ["a.chapter-link.all", "button:has-text('Все главы')", - "a:has-text('Все главы')"]: - try: - el = page.locator(sel).first - if await el.is_visible(timeout=2000): - await el.click() - await page.wait_for_load_state("networkidle", timeout=10_000) - return - except Exception: - pass - - -async def _extract_chapters(page: Page) -> list[Chapter]: - """Основной парсер: #chapters-list → tr.item-row → td[data-num] a.chapter-link""" - rows = await page.query_selector_all("#chapters-list tr.item-row") - chapters = [] - for row in rows: - link = await row.query_selector("td[class*='item-title'] a") - if not link: - continue - href = await link.get_attribute("href") or "" - text = (await link.inner_text()).strip() - if not href: - continue - td = await row.query_selector("td[data-num]") - vol = int((await td.get_attribute("data-vol") or "0")) if td else 0 - num_raw = int((await td.get_attribute("data-num") or "0")) if td else 0 - number = num_raw / 10.0 - full_url = href if href.startswith("http") else _base_url(page.url) + href - chapters.append(Chapter(title=text, url=full_url, number=number, volume=vol)) - return chapters - - -async def _extract_chapters_alt(page: Page) -> list[Chapter]: - result = await page.evaluate(""" - () => { - const links = Array.from(document.querySelectorAll('a[href*="/vol"]')); - return links.map(a => ({ href: a.href, text: a.textContent.trim() })) - .filter(x => x.href && x.text); - } - """) - return [Chapter(title=x["text"], url=x["href"], - number=_parse_num(x["text"]), volume=_parse_vol(x["text"])) - for x in result] - - -def _base_url(url: str) -> str: - m = re.match(r"(https?://[^/]+)", url) - return m.group(1) if m else "https://readmanga.ru" - - -def _parse_num(text: str) -> float: - m = re.search(r"[\d]+(?:[.,]\d+)?", 
text.replace(",", ".")) - return float(m.group()) if m else 0.0 - - -def _parse_vol(text: str) -> int: - m = re.search(r"Том\s+(\d+)", text, re.IGNORECASE) - return int(m.group(1)) if m else 0 - - -# ────────────────────────────────────────────── -# Страница главы — получение URL изображений -# ────────────────────────────────────────────── - -async def _extract_images_from_js(page: Page) -> list[str]: - """ - Извлекает URL из rm_h.readerInit(chapterInfo, [[base, '', path, w, h], ...]). - Считает скобки для точного захвата массива. - """ - try: - result = await page.evaluate(""" - () => { - for (const s of document.querySelectorAll('script')) { - const text = s.textContent || ''; - const mi = text.indexOf('readerInit'); - if (mi === -1) continue; - const ai = text.indexOf('[', mi); - if (ai === -1) continue; - let depth = 0, end = -1; - for (let i = ai; i < text.length; i++) { - if (text[i] === '[') depth++; - else if (text[i] === ']') { depth--; if (!depth) { end = i+1; break; } } - } - if (end === -1) continue; - try { - const arr = eval(text.slice(ai, end)); - if (Array.isArray(arr) && arr.length) - return arr.map(item => Array.isArray(item) && item.length >= 3 - ? 
item[0] + item[2] : null).filter(Boolean); - } catch(e) {} - } - return []; - } - """) - if result: - logger.debug("JS readerInit нашёл {} изображений", len(result)) - return result or [] - except Exception as e: - logger.debug("JS-метод не сработал: {}", e) - return [] - - -async def _extract_images_from_dom(page: Page) -> list[str]: - try: - result = await page.evaluate(""" - () => { - for (const sel of ['img.manga-page', '.page-image img', '#mangaReader img', 'img[data-src]']) { - const found = Array.from(document.querySelectorAll(sel)); - if (found.length) return found.map(i => i.src || i.dataset.src).filter(Boolean); - } - return []; - } - """) - return result or [] - except Exception: - return [] - - -def _get_ext(url: str) -> str: - m = re.search(r"\.(jpg|jpeg|png|webp)(\?|$)", url, re.IGNORECASE) - if m: - ext = m.group(1).lower() - return ".jpg" if ext == "jpeg" else f".{ext}" - return ".jpg" - - -# ────────────────────────────────────────────── -# Скачивание главы -# ────────────────────────────────────────────── - -async def get_chapter_images_and_download( - page: Page, - chapter_url: str, - dest_dir: Path, - manga_url: str | None = None, - on_page: object = None, -) -> list[Path]: - """ - 1. Открывает страницу главы (устанавливает DDoS-Guard cookies для CDN). - 2. Извлекает список URL из readerInit. - 3. Перехватывает img-запросы через page.route() + route.fetch() - (браузерный стек — правильные Sec-Fetch-* заголовки, cookies). - 4. Пролистывает читалку клавишей ArrowRight чтобы загрузить все страницы. - 5. Retry для страниц с timeout через JS fetch. 
- """ - t_start = time.monotonic() - ch_id = chapter_url.split("/")[-1] # короткий идентификатор для логов - logger.info("[{}] Загружаем главу: {}", ch_id, chapter_url) - - from urllib.parse import urlparse - parsed = urlparse(chapter_url) - parts = parsed.path.strip("/").split("/") - manga_slug = parts[0] if parts else "" - referer = manga_url or f"{parsed.scheme}://{parsed.netloc}/{manga_slug}" - - load_url = chapter_url + ("?mtr=1" if "?" not in chapter_url else "&mtr=1") - dest_dir.mkdir(parents=True, exist_ok=True) - - def _base(u: str) -> str: - return u.split("?")[0] - - # Баннеры/рекламные изображения — игнорируем без логирования - BANNER_RE = re.compile(r"466_p\.|570_p\.|banner|advert", re.I) - - def _is_manga_image(url: str) -> bool: - base = _base(url) - if not re.search(r"\.(jpg|jpeg|png|webp)(\?|$)", base, re.I): - return False - if "resrmr." in url or "/static/" in url: - return False - return bool(re.search(r"one-way\.work|staticfa\.|rm\.one-way|cdnmanga|reimg", url, re.I)) - - captured: dict[str, bytes] = {} # base_url → bytes - route_errors: dict[str, str] = {} # base_url → текст ошибки - route_statuses: dict[str, int] = {} # base_url → HTTP status (не 200/206) - lock = asyncio.Lock() - - async def route_handler(route, request): - url = request.url - base = _base(url) - if not _is_manga_image(url): - await route.continue_() - return - if BANNER_RE.search(base): - await route.continue_() - return - async with lock: - already = base in captured - if already: - await route.continue_() - return - fname = base.split("/")[-1] - try: - response = await route.fetch() - status = response.status - body = await response.body() - if body and len(body) > 500 and status in (200, 206): - async with lock: - if base not in captured: - captured[base] = body - logger.debug("[{}] ✓ {}: {} байт", ch_id, fname, len(body)) - if on_page: - try: - asyncio.ensure_future(on_page(0, 0)) - except Exception: - pass - else: - async with lock: - route_statuses[base] = status - if 
status not in (200, 206): - logger.warning("[{}] CDN HTTP {} для '{}' | {}", - ch_id, status, fname, base[-70:]) - else: - logger.warning("[{}] Слишком мал ответ ({} байт) для '{}'", - ch_id, len(body), fname) - await route.fulfill(response=response) - except Exception as e: - err = str(e) - async with lock: - route_errors[base] = err - is_timeout = "timeout" in err.lower() - level = logger.warning if is_timeout else logger.warning - level("[{}] route.fetch {} '{}': {}", - ch_id, "timeout" if is_timeout else "ошибка", fname, err[:150]) - try: - await route.continue_() - except Exception: - pass - - await page.route("**/*", route_handler) - - # 1. Открываем главу - ok = await _navigate(page, load_url, referer=referer) - if not ok: - await page.unroute("**/*", route_handler) - logger.error("[{}] Не удалось открыть главу после всех retry: {}", ch_id, chapter_url) - return [] - - # 2. Ждём readerInit - try: - await page.wait_for_function( - "() => Array.from(document.querySelectorAll('script'))" - ".some(s => s.textContent.includes('readerInit'))", - timeout=15_000, - ) - except Exception as e: - logger.warning("[{}] readerInit не появился за 15с ({}). " - "Продолжаем через DOM-fallback.", ch_id, str(e)[:80]) - - # 3. Извлекаем список URL - image_urls = await _extract_images_from_js(page) - if not image_urls: - logger.debug("[{}] JS readerInit не дал URL, пробуем DOM-парсинг", ch_id) - image_urls = await _extract_images_from_dom(page) - if not image_urls: - await page.unroute("**/*", route_handler) - try: - page_info = await page.evaluate("() => document.title + ' | ' + location.href") - except Exception: - page_info = "?" - logger.error("[{}] Список изображений пуст. 
Текущая страница: {}", ch_id, page_info) - return [] - - logger.info("[{}] Найдено изображений: {}", ch_id, len(image_urls)) - url_to_idx = {_base(u): i for i, u in enumerate(image_urls)} - filename_to_idx = {_base(u).split("/")[-1]: i for i, u in enumerate(image_urls)} - total = len(image_urls) - - def _count_matched() -> int: - count = 0 - for base_url in captured: - if base_url in url_to_idx or base_url.split("/")[-1] in filename_to_idx: - count += 1 - return count - - # 4. Пролистываем читалку - await asyncio.sleep(1) - stall_count = 0 - prev_done = -1 - for i in range(total + 20): - done = _count_matched() - if done >= total: - break - try: - await page.keyboard.press("ArrowRight") - await asyncio.sleep(0.5) - except Exception as e: - logger.warning("[{}] Ошибка листания на шаге {}: {}", ch_id, i + 1, e) - break - if i % 20 == 19: - done = _count_matched() - logger.debug("[{}] Пролистано {}, загружено: {}/{}", ch_id, i + 1, done, total) - if done == prev_done: - stall_count += 1 - if stall_count >= 3: - logger.warning("[{}] Прогресс завис ({}/{}) после {} листаний — прерываем", - ch_id, done, total, i + 1) - break - else: - stall_count = 0 - prev_done = done - - # Финальное ожидание - await asyncio.sleep(3) - - # 5. 
Retry для страниц с timeout через браузерный JS fetch - async with lock: - timeout_bases = [u for u, e in route_errors.items() - if "timeout" in e.lower() and u not in captured] - if timeout_bases: - logger.info("[{}] Retry {} страниц с timeout через JS fetch...", - ch_id, len(timeout_bases)) - for retry_base in timeout_bases: - if retry_base in captured: - continue - fname = retry_base.split("/")[-1] - try: - data_b64 = await page.evaluate("""async (url) => { - try { - const r = await fetch(url, {credentials: 'include'}); - if (!r.ok) return null; - const buf = await r.arrayBuffer(); - const bytes = new Uint8Array(buf); - let bin = ''; - for (let b of bytes) bin += String.fromCharCode(b); - return btoa(bin); - } catch(e) { return null; } - }""", retry_base) - if data_b64: - import base64 - body = base64.b64decode(data_b64) - if len(body) > 500: - async with lock: - captured[retry_base] = body - logger.info("[{}] Retry OK: {} ({} байт)", ch_id, fname, len(body)) - else: - logger.warning("[{}] Retry вернул {} байт для '{}' — игнорируем", - ch_id, len(body), fname) - else: - logger.warning("[{}] Retry вернул null для '{}' | {}", - ch_id, fname, retry_base[-70:]) - except Exception as e2: - logger.warning("[{}] Retry JS ошибка для '{}': {}", ch_id, fname, e2) - - await page.unroute("**/*", route_handler) - - done = _count_matched() - elapsed = time.monotonic() - t_start - logger.info("[{}] Перехвачено: {}/{} за {:.1f}с", ch_id, done, total, elapsed) - - # 6. 
Сохраняем в правильном порядке - filename_to_idx = {_base(u).split("/")[-1]: i for i, u in enumerate(image_urls)} - - paths: dict[int, Path] = {} - unmatched_other: list[str] = [] - for base_url, body in captured.items(): - idx = url_to_idx.get(base_url) - if idx is None: - fname = base_url.split("/")[-1] - idx = filename_to_idx.get(fname) - if idx is None: - if not BANNER_RE.search(base_url): - unmatched_other.append(base_url.split("/")[-1]) - continue - ext = _get_ext(base_url) - p = dest_dir / f"{idx:04d}{ext}" - p.write_bytes(body) - paths[idx] = p - - if unmatched_other: - logger.debug("[{}] Перехвачено, но не совпало с readerInit ({}): {}", - ch_id, len(unmatched_other), unmatched_other) - - # 7. Итоговый отчёт по пропущенным страницам - missing_idxs = [i for i in range(total) if i not in paths] - if missing_idxs: - missing_files = [_base(image_urls[i]).split("/")[-1] for i in missing_idxs] - missing_full = [_base(image_urls[i]) for i in missing_idxs] - - timeout_miss = [missing_files[j] for j, i in enumerate(missing_idxs) - if missing_full[j] in route_errors - and "timeout" in route_errors[missing_full[j]].lower()] - http_miss = [f"{missing_files[j]}(HTTP {route_statuses.get(missing_full[j], '?')})" - for j, i in enumerate(missing_idxs) - if missing_full[j] in route_statuses] - unrcv = [missing_files[j] for j, i in enumerate(missing_idxs) - if missing_full[j] not in route_errors - and missing_full[j] not in route_statuses] - - reasons = [] - if timeout_miss: - reasons.append(f"timeout×{len(timeout_miss)}: {timeout_miss}") - if http_miss: - reasons.append(f"HTTP-err×{len(http_miss)}: {http_miss}") - if unrcv: - reasons.append(f"не_перехвачено×{len(unrcv)}: {unrcv}") - - logger.warning( - "[{}] Пропущено {}/{} стр. 
| №: {} | причины: {}", - ch_id, len(missing_idxs), total, - [i + 1 for i in missing_idxs], - " | ".join(reasons) if reasons else "неизвестно", - ) - logger.debug("[{}] Полные URL пропущенных: {}", ch_id, missing_full) - - return [paths[i] for i in sorted(paths.keys())] - diff --git a/src/sources/__init__.py b/src/sources/__init__.py new file mode 100644 index 0000000..95666f0 --- /dev/null +++ b/src/sources/__init__.py @@ -0,0 +1,74 @@ +""" +Реестр источников манги. + +Для добавления нового источника: +1. Создать файл src/sources/mysource.py с классом, реализующим MangaSourceProtocol +2. Импортировать его здесь и добавить в список SOURCES +""" +from urllib.parse import urlparse +from typing import Optional + +from .base import MangaSourceProtocol +from .readmanga import ReadmangaSource + +# ── Регистрация источников ───────────────────── +# Добавьте новые источники сюда: +SOURCES: list = [ + ReadmangaSource(), +] + +# Быстрый поиск по slug +_BY_SLUG: dict[str, object] = {s.slug: s for s in SOURCES} + + +class SourceRegistry: + """Реестр источников. Источники определяются только в коде.""" + + def get_by_slug(self, slug: str) -> Optional[object]: + return _BY_SLUG.get(slug) + + def get_by_db_id(self, source_id: int, db) -> Optional[object]: + """Резолвит адаптер через БД: source_id → slug → экземпляр.""" + row = db.get_source_by_id(source_id) + if not row: + return None + return _BY_SLUG.get(row["slug"]) + + def all_sources(self) -> list: + return list(SOURCES) + + def all_slugs(self) -> list[str]: + return [s.slug for s in SOURCES] + + +registry = SourceRegistry() + + +def get_source_for_url(url: str, db) -> Optional[object]: + """ + Определяет источник по домену URL. + Ищет домен в таблице source_domains → возвращает адаптер. + Если домен не зарегистрирован — возвращает None. 
+ """ + try: + domain = urlparse(url).netloc.lower() + if domain.startswith("www."): + domain = domain[4:] + row = db.get_source_by_domain(domain) + if not row: + return None + return _BY_SLUG.get(row["slug"]) + except Exception: + return None + + +def extract_domain(url: str) -> str: + """Извлекает домен без www.""" + try: + domain = urlparse(url).netloc.lower() + if domain.startswith("www."): + domain = domain[4:] + return domain + except Exception: + return "" + diff --git a/src/sources/base.py b/src/sources/base.py new file mode 100644 index 0000000..9438340 --- /dev/null +++ b/src/sources/base.py @@ -0,0 +1,58 @@ +""" +Базовые модели данных и Protocol-интерфейс для источников манги. +""" +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional, Protocol, runtime_checkable + +from playwright.async_api import Page + + +# ────────────────────────────────────────────── +# Модели данных (общие для всех источников) +# ────────────────────────────────────────────── + +@dataclass +class Chapter: + title: str + url: str + number: float = 0.0 + volume: int = 0 + + +@dataclass +class MangaInfo: + title: str + url: str + chapters: list[Chapter] = field(default_factory=list) + pub_status: str = "unknown" # completed / ongoing / unknown + title_ru: str = "" + title_full: str = "" + description: str = "" + genres: list[str] = field(default_factory=list) + + +# ────────────────────────────────────────────── +# Интерфейс источника +# ────────────────────────────────────────────── + +@runtime_checkable +class MangaSourceProtocol(Protocol): + slug: str # уникальный код источника в коде ("readmanga") + display_name: str # название для UI ("ReadManga") + + async def get_manga_info(self, page: Page, url: str) -> Optional[MangaInfo]: + """Возвращает информацию о манге и список глав.""" + ... 
+ + async def get_chapter_images_and_download( + self, + page: Page, + chapter_url: str, + dest_dir: Path, + manga_url: Optional[str] = None, + on_page: object = None, + ) -> list[Path]: + """Скачивает страницы главы в dest_dir и возвращает список путей.""" + ... + diff --git a/src/sources/readmanga.py b/src/sources/readmanga.py new file mode 100644 index 0000000..71241c7 --- /dev/null +++ b/src/sources/readmanga.py @@ -0,0 +1,589 @@ +""" +Адаптер ReadManga: поддерживает readmanga.ru и все его клоны. +""" +import asyncio +import base64 +import re +import time +from pathlib import Path +from typing import Optional + +from loguru import logger +from playwright.async_api import Page + +from .base import Chapter, MangaInfo + + +class ReadmangaSource: + slug = "readmanga" + display_name = "ReadManga" + + # CDN-домены из которых принимаем картинки глав + cdn_patterns = ["one-way.work", "staticfa.", "rm.one-way", "cdnmanga", "reimg"] + + # ────────────────────────────────────────────── + # Страница манги — список глав + # ────────────────────────────────────────────── + + async def get_manga_info(self, page: Page, url: str) -> Optional[MangaInfo]: + """Открывает страницу манги и возвращает список всех глав.""" + logger.info("Загружаем страницу манги: {}", url) + ok = await _navigate(page, url) + if not ok: + return None + + title_full = await page.title() + title_full = re.sub(r"\s*[-–|].*$", "", title_full).strip() + + title_ru = await _extract_ru_title_from_dom(page) + if not title_ru: + title_ru = _parse_ru_title(title_full) + + logger.info("Манга: {} | ru: {}", title_full, title_ru) + + pub_status = await _extract_pub_status(page) + logger.info("Статус выпуска: {}", pub_status) + + description = await _extract_description(page) + genres = await _extract_genres(page) + + await _expand_chapters(page) + chapters = await _extract_chapters(page) + if not chapters: + chapters = await _extract_chapters_alt(page) + + logger.info("Найдено глав: {}", len(chapters)) + return 
MangaInfo( + title=title_ru or title_full, + url=url, + chapters=chapters, + pub_status=pub_status, + title_ru=title_ru, + title_full=title_full, + description=description, + genres=genres, + ) + + # ────────────────────────────────────────────── + # Скачивание главы + # ────────────────────────────────────────────── + + async def get_chapter_images_and_download( + self, + page: Page, + chapter_url: str, + dest_dir: Path, + manga_url: Optional[str] = None, + on_page: object = None, + ) -> list[Path]: + """ + 1. Открывает страницу главы. + 2. Извлекает список URL из readerInit. + 3. Перехватывает img-запросы через page.route(). + 4. Пролистывает читалку клавишей ArrowRight. + 5. Retry для страниц с timeout через JS fetch. + """ + cdn_patterns = self.cdn_patterns + t_start = time.monotonic() + ch_id = chapter_url.split("/")[-1] + logger.info("[{}] Загружаем главу: {}", ch_id, chapter_url) + + from urllib.parse import urlparse + parsed = urlparse(chapter_url) + parts = parsed.path.strip("/").split("/") + manga_slug = parts[0] if parts else "" + referer = manga_url or f"{parsed.scheme}://{parsed.netloc}/{manga_slug}" + + load_url = chapter_url + ("?mtr=1" if "?" not in chapter_url else "&mtr=1") + dest_dir.mkdir(parents=True, exist_ok=True) + + def _base(u: str) -> str: + return u.split("?")[0] + + BANNER_RE = re.compile(r"466_p\.|570_p\.|banner|advert", re.I) + + def _is_manga_image(url: str) -> bool: + base = _base(url) + if not re.search(r"\.(jpg|jpeg|png|webp)(\?|$)", base, re.I): + return False + if "resrmr." 
in url or "/static/" in url: + return False + pattern = "|".join(re.escape(p) for p in cdn_patterns) + return bool(re.search(pattern, url, re.I)) + + captured: dict[str, bytes] = {} + route_errors: dict[str, str] = {} + route_statuses: dict[str, int] = {} + lock = asyncio.Lock() + + async def route_handler(route, request): + url = request.url + base = _base(url) + if not _is_manga_image(url): + await route.continue_() + return + if BANNER_RE.search(base): + await route.continue_() + return + async with lock: + already = base in captured + if already: + await route.continue_() + return + fname = base.split("/")[-1] + try: + response = await route.fetch() + status = response.status + body = await response.body() + if body and len(body) > 500 and status in (200, 206): + async with lock: + if base not in captured: + captured[base] = body + logger.debug("[{}] ✓ {}: {} байт", ch_id, fname, len(body)) + if on_page: + try: + asyncio.ensure_future(on_page(0, 0)) + except Exception: + pass + else: + async with lock: + route_statuses[base] = status + if status not in (200, 206): + logger.warning("[{}] CDN HTTP {} для '{}' | {}", + ch_id, status, fname, base[-70:]) + else: + logger.warning("[{}] Слишком мал ответ ({} байт) для '{}'", + ch_id, len(body), fname) + await route.fulfill(response=response) + except Exception as e: + err = str(e) + async with lock: + route_errors[base] = err + is_timeout = "timeout" in err.lower() + logger.warning("[{}] route.fetch {} '{}': {}", + ch_id, "timeout" if is_timeout else "ошибка", fname, err[:150]) + try: + await route.continue_() + except Exception: + pass + + await page.route("**/*", route_handler) + + ok = await _navigate(page, load_url, referer=referer) + if not ok: + await page.unroute("**/*", route_handler) + logger.error("[{}] Не удалось открыть главу: {}", ch_id, chapter_url) + return [] + + try: + await page.wait_for_function( + "() => Array.from(document.querySelectorAll('script'))" + ".some(s => 
s.textContent.includes('readerInit'))", + timeout=15_000, + ) + except Exception as e: + logger.warning("[{}] readerInit не появился за 15с ({}). DOM-fallback.", ch_id, str(e)[:80]) + + image_urls = await _extract_images_from_js(page) + if not image_urls: + logger.debug("[{}] JS readerInit не дал URL, пробуем DOM-парсинг", ch_id) + image_urls = await _extract_images_from_dom(page) + if not image_urls: + await page.unroute("**/*", route_handler) + try: + page_info = await page.evaluate("() => document.title + ' | ' + location.href") + except Exception: + page_info = "?" + logger.error("[{}] Список изображений пуст. Страница: {}", ch_id, page_info) + return [] + + logger.info("[{}] Найдено изображений: {}", ch_id, len(image_urls)) + url_to_idx = {_base(u): i for i, u in enumerate(image_urls)} + filename_to_idx = {_base(u).split("/")[-1]: i for i, u in enumerate(image_urls)} + total = len(image_urls) + + def _count_matched() -> int: + count = 0 + for base_url in captured: + if base_url in url_to_idx or base_url.split("/")[-1] in filename_to_idx: + count += 1 + return count + + await asyncio.sleep(1) + stall_count = 0 + prev_done = -1 + for i in range(total + 20): + done = _count_matched() + if done >= total: + break + try: + await page.keyboard.press("ArrowRight") + await asyncio.sleep(0.5) + except Exception as e: + logger.warning("[{}] Ошибка листания на шаге {}: {}", ch_id, i + 1, e) + break + if i % 20 == 19: + done = _count_matched() + logger.debug("[{}] Пролистано {}, загружено: {}/{}", ch_id, i + 1, done, total) + if done == prev_done: + stall_count += 1 + if stall_count >= 3: + logger.warning("[{}] Прогресс завис ({}/{}) — прерываем", ch_id, done, total) + break + else: + stall_count = 0 + prev_done = done + + await asyncio.sleep(3) + + # Retry timeout через JS fetch + async with lock: + timeout_bases = [u for u, e in route_errors.items() + if "timeout" in e.lower() and u not in captured] + if timeout_bases: + logger.info("[{}] Retry {} страниц с timeout...", 
ch_id, len(timeout_bases)) + for retry_base in timeout_bases: + if retry_base in captured: + continue + fname = retry_base.split("/")[-1] + try: + data_b64 = await page.evaluate("""async (url) => { + try { + const r = await fetch(url, {credentials: 'include'}); + if (!r.ok) return null; + const buf = await r.arrayBuffer(); + const bytes = new Uint8Array(buf); + let bin = ''; + for (let b of bytes) bin += String.fromCharCode(b); + return btoa(bin); + } catch(e) { return null; } + }""", retry_base) + if data_b64: + body = base64.b64decode(data_b64) + if len(body) > 500: + async with lock: + captured[retry_base] = body + logger.info("[{}] Retry OK: {} ({} байт)", ch_id, fname, len(body)) + else: + logger.warning("[{}] Retry вернул {} байт — игнорируем", ch_id, len(body)) + else: + logger.warning("[{}] Retry null для '{}'", ch_id, fname) + except Exception as e2: + logger.warning("[{}] Retry JS ошибка '{}': {}", ch_id, fname, e2) + + await page.unroute("**/*", route_handler) + + done = _count_matched() + elapsed = time.monotonic() - t_start + logger.info("[{}] Перехвачено: {}/{} за {:.1f}с", ch_id, done, total, elapsed) + + filename_to_idx = {_base(u).split("/")[-1]: i for i, u in enumerate(image_urls)} + + paths: dict[int, Path] = {} + unmatched_other: list[str] = [] + for base_url, body in captured.items(): + idx = url_to_idx.get(base_url) + if idx is None: + fname = base_url.split("/")[-1] + idx = filename_to_idx.get(fname) + if idx is None: + if not BANNER_RE.search(base_url): + unmatched_other.append(base_url.split("/")[-1]) + continue + ext = _get_ext(base_url) + p = dest_dir / f"{idx:04d}{ext}" + p.write_bytes(body) + paths[idx] = p + + if unmatched_other: + logger.debug("[{}] Не совпало с readerInit ({}): {}", ch_id, len(unmatched_other), unmatched_other) + + missing_idxs = [i for i in range(total) if i not in paths] + if missing_idxs: + missing_files = [_base(image_urls[i]).split("/")[-1] for i in missing_idxs] + missing_full = [_base(image_urls[i]) for i in 
missing_idxs] + + timeout_miss = [missing_files[j] for j, i in enumerate(missing_idxs) + if missing_full[j] in route_errors + and "timeout" in route_errors[missing_full[j]].lower()] + http_miss = [f"{missing_files[j]}(HTTP {route_statuses.get(missing_full[j], '?')})" + for j, i in enumerate(missing_idxs) + if missing_full[j] in route_statuses] + unrcv = [missing_files[j] for j, i in enumerate(missing_idxs) + if missing_full[j] not in route_errors + and missing_full[j] not in route_statuses] + + reasons = [] + if timeout_miss: + reasons.append(f"timeout×{len(timeout_miss)}: {timeout_miss}") + if http_miss: + reasons.append(f"HTTP-err×{len(http_miss)}: {http_miss}") + if unrcv: + reasons.append(f"не_перехвачено×{len(unrcv)}: {unrcv}") + + logger.warning( + "[{}] Пропущено {}/{} стр. | №: {} | причины: {}", + ch_id, len(missing_idxs), total, + [i + 1 for i in missing_idxs], + " | ".join(reasons) if reasons else "неизвестно", + ) + + return [paths[i] for i in sorted(paths.keys())] + + +# ────────────────────────────────────────────── +# Вспомогательные функции (приватные) +# ────────────────────────────────────────────── + +async def _navigate(page: Page, url: str, retries: int = 3, + referer: str | None = None) -> bool: + from urllib.parse import urlparse + if referer is None: + p = urlparse(url) + referer = f"{p.scheme}://{p.netloc}/" + for attempt in range(1, retries + 1): + try: + resp = await page.goto(url, wait_until="domcontentloaded", + timeout=60_000, referer=referer) + if resp and resp.status >= 400: + logger.warning("Попытка {}/{}: HTTP {}", attempt, retries, resp.status) + await asyncio.sleep(3 * attempt) + continue + try: + await page.wait_for_load_state("networkidle", timeout=10_000) + except Exception: + pass + return True + except Exception as e: + logger.warning("Попытка {}/{}: {}", attempt, retries, e) + await asyncio.sleep(3 * attempt) + return False + + +async def _extract_ru_title_from_dom(page: Page) -> str: + try: + result = await 
page.evaluate(""" + () => { + const selectors = [ + '.names .name', 'h1.manga-title', 'h1 .name', '.name-block .name', + ]; + for (const sel of selectors) { + const el = document.querySelector(sel); + if (el && el.textContent.trim()) return el.textContent.trim(); + } + return ''; + } + """) + return (result or "").strip() + except Exception: + return "" + + +def _parse_ru_title(full_title: str) -> str: + t = full_title.strip() + t = re.sub(r'^Манга\s+', '', t).strip() + t = re.split(r'\s*[\(\[]', t)[0].strip() + t = re.sub(r'\s+онлайн\s*$', '', t, flags=re.IGNORECASE).strip() + words = t.split() + result = [] + for w in words: + if re.search(r'[а-яёА-ЯЁ]', w): + result.append(w) + elif re.search(r'[a-zA-Z]', w): + if result: + break + else: + if result: + result.append(w) + while result and not re.search(r'[а-яёА-ЯЁ]', result[-1]): + result.pop() + if result: + t = ' '.join(result) + return t + + +async def _extract_pub_status(page: Page) -> str: + try: + result = await page.evaluate(""" + () => { + const statusSelectors = [ + '.elem_status .value', '.manga-info .status', + '[class*="status"] .value', '.property .status', + ]; + for (const sel of statusSelectors) { + const el = document.querySelector(sel); + if (el) { + const t = el.textContent.toLowerCase(); + if (t.includes('завершён') || t.includes('завершен') || t.includes('complete')) return 'completed'; + if (t.includes('продолжает') || t.includes('ongoing')) return 'ongoing'; + } + } + const bodyText = document.body ? 
document.body.innerText.toLowerCase() : ''; + if (bodyText.includes('выпуск завершён') || bodyText.includes('выпуск завершен')) return 'completed'; + if (bodyText.includes('продолжается')) return 'ongoing'; + return 'unknown'; + } + """) + return result or "unknown" + except Exception: + return "unknown" + + +async def _extract_description(page: Page) -> str: + try: + result = await page.evaluate(""" + () => { + const selectors = [ + '.manga-description', '.elem_descr .value', + '#tab-description .description-text', '.description', + '[itemprop="description"]', + ]; + for (const sel of selectors) { + const el = document.querySelector(sel); + if (el && el.textContent.trim()) return el.textContent.trim(); + } + return ''; + } + """) + return (result or "").strip()[:2000] + except Exception: + return "" + + +async def _extract_genres(page: Page) -> list[str]: + try: + result = await page.evaluate(""" + () => { + const selectors = [ + '.elem_genre .value a', '.genres a', + '[itemprop="genre"]', '.genre-list a', + ]; + for (const sel of selectors) { + const els = document.querySelectorAll(sel); + if (els.length) return Array.from(els).map(e => e.textContent.trim()).filter(Boolean); + } + return []; + } + """) + return result or [] + except Exception: + return [] + + +async def _expand_chapters(page: Page): + for sel in ["a.chapter-link.all", "button:has-text('Все главы')", "a:has-text('Все главы')"]: + try: + el = page.locator(sel).first + if await el.is_visible(timeout=2000): + await el.click() + await page.wait_for_load_state("networkidle", timeout=10_000) + return + except Exception: + pass + + +async def _extract_chapters(page: Page) -> list[Chapter]: + rows = await page.query_selector_all("#chapters-list tr.item-row") + chapters = [] + for row in rows: + link = await row.query_selector("td[class*='item-title'] a") + if not link: + continue + href = await link.get_attribute("href") or "" + text = (await link.inner_text()).strip() + if not href: + continue + td = 
await row.query_selector("td[data-num]") + vol = int((await td.get_attribute("data-vol") or "0")) if td else 0 + num_raw = int((await td.get_attribute("data-num") or "0")) if td else 0 + number = num_raw / 10.0 + full_url = href if href.startswith("http") else _base_url(page.url) + href + chapters.append(Chapter(title=text, url=full_url, number=number, volume=vol)) + return chapters + + +async def _extract_chapters_alt(page: Page) -> list[Chapter]: + result = await page.evaluate(""" + () => { + const links = Array.from(document.querySelectorAll('a[href*="/vol"]')); + return links.map(a => ({ href: a.href, text: a.textContent.trim() })) + .filter(x => x.href && x.text); + } + """) + return [Chapter(title=x["text"], url=x["href"], + number=_parse_num(x["text"]), volume=_parse_vol(x["text"])) + for x in result] + + +async def _extract_images_from_js(page: Page) -> list[str]: + try: + result = await page.evaluate(""" + () => { + for (const s of document.querySelectorAll('script')) { + const text = s.textContent || ''; + const mi = text.indexOf('readerInit'); + if (mi === -1) continue; + const ai = text.indexOf('[', mi); + if (ai === -1) continue; + let depth = 0, end = -1; + for (let i = ai; i < text.length; i++) { + if (text[i] === '[') depth++; + else if (text[i] === ']') { depth--; if (!depth) { end = i+1; break; } } + } + if (end === -1) continue; + try { + const arr = eval(text.slice(ai, end)); + if (Array.isArray(arr) && arr.length) + return arr.map(item => Array.isArray(item) && item.length >= 3 + ? 
item[0] + item[2] : null).filter(Boolean); + } catch(e) {} + } + return []; + } + """) + if result: + logger.debug("JS readerInit нашёл {} изображений", len(result)) + return result or [] + except Exception as e: + logger.debug("JS-метод не сработал: {}", e) + return [] + + +async def _extract_images_from_dom(page: Page) -> list[str]: + try: + result = await page.evaluate(""" + () => { + for (const sel of ['img.manga-page', '.page-image img', '#mangaReader img', 'img[data-src]']) { + const found = Array.from(document.querySelectorAll(sel)); + if (found.length) return found.map(i => i.src || i.dataset.src).filter(Boolean); + } + return []; + } + """) + return result or [] + except Exception: + return [] + + +def _get_ext(url: str) -> str: + m = re.search(r"\.(jpg|jpeg|png|webp)(\?|$)", url, re.IGNORECASE) + if m: + ext = m.group(1).lower() + return ".jpg" if ext == "jpeg" else f".{ext}" + return ".jpg" + + +def _base_url(url: str) -> str: + m = re.match(r"(https?://[^/]+)", url) + return m.group(1) if m else "https://readmanga.ru" + + +def _parse_num(text: str) -> float: + m = re.search(r"[\d]+(?:[.,]\d+)?", text.replace(",", ".")) + return float(m.group()) if m else 0.0 + + +def _parse_vol(text: str) -> int: + m = re.search(r"Том\s+(\d+)", text, re.IGNORECASE) + return int(m.group(1)) if m else 0 + diff --git a/src/state.py b/src/state.py index 773100b..4a2bc68 100644 --- a/src/state.py +++ b/src/state.py @@ -1,14 +1,25 @@ """ Хранение состояния скачивания в SQLite. 
""" +import json import sqlite3 from datetime import datetime from pathlib import Path from typing import Optional +from urllib.parse import urlparse DB_PATH = Path("/app/state/progress.db") +# Домены ReadManga по умолчанию (сидинг при первом запуске) +_DEFAULT_READMANGA_DOMAINS = [ + "readmanga.ru", + "readmanga.live", + "readmanga.me", + "readmanga.io", + "3.readmanga.ru", +] + class StateDB: def __init__(self, db_path: Path = DB_PATH): @@ -68,18 +79,35 @@ class StateDB: created_at TEXT ) """) + self.conn.execute(""" + CREATE TABLE IF NOT EXISTS sources ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + slug TEXT UNIQUE NOT NULL, + display_name TEXT NOT NULL, + settings TEXT DEFAULT '{}', + created_at TEXT + ) + """) + self.conn.execute(""" + CREATE TABLE IF NOT EXISTS source_domains ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source_id INTEGER NOT NULL REFERENCES sources(id), + domain TEXT UNIQUE NOT NULL + ) + """) # Migrate old DB: add missing columns migrations = [ - ("chapters", "pages_total", "INTEGER DEFAULT 0"), - ("chapters", "pages_done", "INTEGER DEFAULT 0"), - ("mangas", "title_ru", "TEXT"), - ("mangas", "title_full", "TEXT"), - ("mangas", "pub_status", "TEXT DEFAULT 'unknown'"), - ("mangas", "auto_update", "INTEGER DEFAULT 0"), - ("mangas", "last_checked_at", "TEXT"), - ("mangas", "started_at", "TEXT"), - ("mangas", "finished_at", "TEXT"), - ("mangas", "folder_name", "TEXT"), + ("chapters", "pages_total", "INTEGER DEFAULT 0"), + ("chapters", "pages_done", "INTEGER DEFAULT 0"), + ("mangas", "title_ru", "TEXT"), + ("mangas", "title_full", "TEXT"), + ("mangas", "pub_status", "TEXT DEFAULT 'unknown'"), + ("mangas", "auto_update", "INTEGER DEFAULT 0"), + ("mangas", "last_checked_at","TEXT"), + ("mangas", "started_at", "TEXT"), + ("mangas", "finished_at", "TEXT"), + ("mangas", "folder_name", "TEXT"), + ("mangas", "source_id", "INTEGER REFERENCES sources(id)"), ] for table, col, typedef in migrations: try: @@ -88,17 +116,184 @@ class StateDB: pass 
self.conn.commit() + def sync_sources(self, registry) -> None: + """ + Синхронизирует таблицу sources с реестром из кода. + Вызывается при старте приложения. + При первом запуске создаёт записи и засеивает домены ReadManga. + """ + from loguru import logger + for source in registry.all_sources(): + existing = self.conn.execute( + "SELECT id, display_name FROM sources WHERE slug=?", (source.slug,) + ).fetchone() + if not existing: + self.conn.execute( + "INSERT INTO sources (slug, display_name, settings, created_at) VALUES (?,?,?,?)", + (source.slug, source.display_name, "{}", _now()) + ) + logger.info("Источник добавлен в БД: {} ({})", source.display_name, source.slug) + else: + if existing["display_name"] != source.display_name: + self.conn.execute( + "UPDATE sources SET display_name=? WHERE slug=?", + (source.display_name, source.slug) + ) + self.conn.commit() + + # Сидинг доменов ReadManga при первом запуске + rm = self.conn.execute("SELECT id FROM sources WHERE slug='readmanga'").fetchone() + if rm: + count = self.conn.execute( + "SELECT COUNT(*) FROM source_domains WHERE source_id=?", (rm["id"],) + ).fetchone()[0] + if count == 0: + for domain in _DEFAULT_READMANGA_DOMAINS: + try: + self.conn.execute( + "INSERT INTO source_domains (source_id, domain) VALUES (?,?)", + (rm["id"], domain) + ) + except Exception: + pass + self.conn.commit() + logger.info("Сидинг доменов ReadManga: {} доменов", len(_DEFAULT_READMANGA_DOMAINS)) + + # Логируем источники в БД без кода (не в реестре) + known_slugs = set(registry.all_slugs()) + db_slugs = [r["slug"] for r in self.conn.execute("SELECT slug FROM sources").fetchall()] + for slug in db_slugs: + if slug not in known_slugs: + logger.warning("Источник '{}' есть в БД, но отсутствует в реестре — манги недоступны", slug) + + def migrate_manga_sources(self) -> int: + """ + Авто-миграция: проставляет source_id для манг с source_id IS NULL. + Определяет источник по домену URL через source_domains. 
# NOTE(review): every getter below converts rows with dict(row) or reads them
# by column name, so the connection presumably uses sqlite3.Row as
# row_factory; that setup is outside this block - confirm.

def get_source_by_id(self, source_id: int) -> Optional[dict]:
    """Return the ``sources`` row whose ``id`` is ``source_id`` as a dict, or ``None``."""
    row = self.conn.execute(
        "SELECT * FROM sources WHERE id=?", (source_id,)
    ).fetchone()
    return dict(row) if row else None


def get_source_by_slug(self, slug: str) -> Optional[dict]:
    """Return the ``sources`` row whose ``slug`` is ``slug`` as a dict, or ``None``."""
    row = self.conn.execute(
        "SELECT * FROM sources WHERE slug=?", (slug,)
    ).fetchone()
    return dict(row) if row else None


def get_source_by_domain(self, domain: str) -> Optional[dict]:
    """
    Return the source that owns ``domain`` (join through ``source_domains``).

    The key is trimmed and lower-cased before the lookup, which is the same
    normalisation ``add_domain`` applies when it stores a domain. An empty
    or ``None`` key yields ``None`` without touching the database.
    """
    if not domain:
        return None
    key = domain.strip().lower()
    row = self.conn.execute("""
        SELECT s.* FROM sources s
        JOIN source_domains sd ON sd.source_id = s.id
        WHERE sd.domain=?
    """, (key,)).fetchone()
    return dict(row) if row else None


def get_all_sources(self) -> list[dict]:
    """
    Return every source (ordered by id) with its domains and parsed settings.

    Each item is the ``sources`` row as a dict, extended with:

    * ``domains``  - that source's domains, sorted by name;
    * ``settings`` - the ``settings`` column decoded from JSON; falls back to
      ``{}`` when the column is empty, is not valid JSON, or does not decode
      to a JSON object.
    """
    # Fetch all domains once and group them in memory instead of issuing one
    # query per source.
    domains_by_source: dict = {}
    domain_rows = self.conn.execute(
        "SELECT source_id, domain FROM source_domains ORDER BY domain"
    ).fetchall()
    for d in domain_rows:
        domains_by_source.setdefault(d["source_id"], []).append(d["domain"])

    result = []
    for s in self.conn.execute("SELECT * FROM sources ORDER BY id").fetchall():
        s_dict = dict(s)
        s_dict["domains"] = domains_by_source.get(s["id"], [])
        try:
            settings = json.loads(s_dict.get("settings") or "{}")
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers non-text values.
            settings = {}
        s_dict["settings"] = settings if isinstance(settings, dict) else {}
        result.append(s_dict)
    return result
def remove_domain(self, source_id: int, domain: str) -> bool:
    """
    Detach ``domain`` from the source ``source_id``.

    The domain is trimmed and lower-cased first - the same normalisation
    ``add_domain`` applies on insert - so a value that was accepted when
    added can be removed with the same input.

    Returns:
        True when a row was deleted, False when nothing matched.
    """
    key = domain.strip().lower()
    cur = self.conn.execute(
        "DELETE FROM source_domains WHERE source_id=? AND domain=?",
        (source_id, key),
    )
    self.conn.commit()
    return cur.rowcount > 0


def set_manga_source(self, manga_url: str, source_id: int) -> None:
    """Assign ``source_id`` to the manga with URL ``manga_url`` and bump ``updated_at``."""
    self.conn.execute(
        "UPDATE mangas SET source_id=?, updated_at=? WHERE url=?",
        (source_id, _now(), manga_url),
    )
    self.conn.commit()


def reset_failed_chapters(self, manga_url: str) -> int:
    """
    Put failed and partially downloaded chapters of a manga back to ``pending``.

    A chapter is reset (status ``pending``, page counters zeroed) when it is
    either ``failed``, or ``done`` with a known page total that was not fully
    reached (``pages_total > 0 AND pages_done < pages_total``).

    Returns:
        Number of chapters that were reset.
    """
    # The two conditions are mutually exclusive on ``status``, so a single
    # UPDATE touches exactly the rows two separate statements would.
    reset = self.conn.execute(
        """UPDATE chapters SET status='pending', pages_done=0, pages_total=0, updated_at=?
           WHERE manga_url=?
             AND (status='failed'
                  OR (status='done' AND pages_total > 0 AND pages_done < pages_total))""",
        (_now(), manga_url),
    ).rowcount
    self.conn.commit()
    return reset


def count_mangas_by_source_domain(self, domain: str) -> int:
    """
    Count the mangas assigned to the source that owns ``domain``.

    Note that this counts every manga of that source, not only the mangas
    whose URL is on ``domain``. Returns 0 when no source owns the domain.
    The original docstring says the number is used for UI warnings.
    """
    source = self.get_source_by_domain(domain)
    if not source:
        return 0
    row = self.conn.execute(
        "SELECT COUNT(*) FROM mangas WHERE source_id=?", (source["id"],)
    ).fetchone()
    return row[0]
def _now() -> str:
    """
    Return the current UTC time as a naive ISO-8601 string.

    The value has no UTC-offset suffix, i.e. the same format
    ``datetime.utcnow().isoformat()`` produced, so timestamps already stored
    in that format stay comparable.
    """
    # Local import: the file-level import only brings in ``datetime``.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12; take an aware "now"
    # and drop tzinfo to keep the stored format unchanged.
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()


def _extract_domain(url: str) -> str:
    """
    Extract the lower-cased host name from ``url``, without a leading ``www.``.

    Uses the parsed ``hostname`` rather than ``netloc`` so that credentials
    (``user:pass@``) and an explicit port (``:8443``) are not part of the
    result; otherwise such URLs could never match a stored domain.

    Returns:
        The host name, or ``""`` when the URL has no network location
        (for example a scheme-less string) or cannot be parsed.
    """
    try:
        host = urlparse(url).hostname
        if not host:
            return ""
        host = host.lower()
        if host.startswith("www."):
            host = host[4:]
        return host
    except (ValueError, TypeError, AttributeError):
        # Best-effort helper: malformed URLs (e.g. a broken IPv6 literal) or
        # non-string input mean "no domain" instead of an exception.
        return ""
Выберите источник в настройках манги."}) + return + async with BrowserManager(headless=True) as bm: ctx, info_page = await bm.new_page() - manga = await get_manga_info(info_page, url) + manga = await source.get_manga_info(info_page, url) await info_page.close() if not manga: @@ -193,7 +208,7 @@ async def download_manga( "pages_total": pages_total, }) - image_paths = await get_chapter_images_and_download( + image_paths = await source.get_chapter_images_and_download( ch_page, ch.url, dest_dir=tmp_path, manga_url=url, @@ -329,9 +344,19 @@ async def check_for_updates( db.add_history(manga_url=url, event_type="check_started") await emit({"type": "check_started", "url": url}) + # Резолвим источник + source = get_source_for_url(url, db) + if source is None: + manga_row = db.get_manga(url) + if manga_row and manga_row.get("source_id"): + source = registry.get_by_db_id(manga_row["source_id"], db) + if source is None: + await emit({"type": "source_unknown", "url": url}) + return [] + async with BrowserManager(headless=True) as bm: _, page = await bm.new_page() - manga = await get_manga_info(page, url) + manga = await source.get_manga_info(page, url) await page.close() if not manga: return []