Base app

2026-04-29 02:07:21 +03:00
parent ba6bfc5ed3
commit 0aa057c991
14 changed files with 4257 additions and 139 deletions
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -3,6 +3,7 @@
 """
 import asyncio
 import re
+import time
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Optional
@@ -30,6 +31,11 @@ class MangaInfo:
    title: str
    url: str
    chapters: list[Chapter] = field(default_factory=list)
+    pub_status: str = "unknown"   # completed / ongoing / unknown
+    title_ru: str = ""            # Только русский тайтл (для папки)
+    title_full: str = ""          # Полный тайтл как на странице
+    description: str = ""         # Описание/синопсис
+    genres: list[str] = field(default_factory=list)  # Жанры


 # ──────────────────────────────────────────────
@@ -43,9 +49,21 @@ async def get_manga_info(page: Page, url: str) -> Optional[MangaInfo]:
    if not ok:
        return None

-    title = await page.title()
-    title = re.sub(r"\s*[-–|].*$", "", title).strip()
-    logger.info("Манга: {}", title)
+    title_full = await page.title()
+    title_full = re.sub(r"\s*[-–|].*$", "", title_full).strip()
+
+    # Пробуем взять русский тайтл напрямую из DOM
+    title_ru = await _extract_ru_title_from_dom(page)
+    if not title_ru:
+        title_ru = _parse_ru_title(title_full)
+
+    logger.info("Манга: {} | ru: {}", title_full, title_ru)
+
+    pub_status = await _extract_pub_status(page)
+    logger.info("Статус выпуска: {}", pub_status)
+
+    description = await _extract_description(page)
+    genres = await _extract_genres(page)

    await _expand_chapters(page)
    chapters = await _extract_chapters(page)
@@ -53,7 +71,162 @@ async def get_manga_info(page: Page, url: str) -> Optional[MangaInfo]:
        chapters = await _extract_chapters_alt(page)

    logger.info("Найдено глав: {}", len(chapters))
-    return MangaInfo(title=title, url=url, chapters=chapters)
+    return MangaInfo(
+        title=title_ru or title_full,
+        url=url,
+        chapters=chapters,
+        pub_status=pub_status,
+        title_ru=title_ru,
+        title_full=title_full,
+        description=description,
+        genres=genres,
+    )
+
+
+async def _extract_ru_title_from_dom(page: Page) -> str:
+    """Return the Russian title found in the readmanga page DOM, or an empty string."""
+    try:
+        result = await page.evaluate("""
+            () => {
+                // readmanga: основной тайтл в span.name внутри .names
+                const selectors = [
+                    '.names .name',
+                    'h1.manga-title',
+                    'h1 .name',
+                    '.name-block .name',
+                ];
+                for (const sel of selectors) {
+                    const el = document.querySelector(sel);
+                    if (el && el.textContent.trim()) return el.textContent.trim();
+                }
+                return '';
+            }
+        """)
+        return (result or "").strip()  # a null/empty JS result is normalised to ""
+    except Exception:  # best-effort lookup: any failure here yields ""
+        return ""
+
+
+def _parse_ru_title(full_title: str) -> str:
+    """Extract the Russian title from a full page-title string.
+
+    Keeps the tokens up to the first Latin word that follows a Cyrillic one.
+    Trailing tokens with neither Cyrillic letters nor digits (dangling dashes,
+    dots) are dropped; numbers such as a sequel index are kept. If no token
+    contains Cyrillic, the cleaned string is returned unchanged.
+
+    Examples:
+      'Манга Магическая битва (Sorcery Fight) Гэгэ онлайн' → 'Магическая битва'
+      'Авантюрист Monster Eater Adventurer'                → 'Авантюрист'
+      'Манга Берсерк 2 (Berserk 2)'                        → 'Берсерк 2'
+    """
+    t = full_title.strip()
+    # Drop the "Манга " prefix.
+    t = re.sub(r'^Манга\s+', '', t).strip()
+    # Keep only the part before the first bracket (start of the English title).
+    t = re.split(r'\s*[\(\[]', t)[0].strip()
+    # Drop the " онлайн" suffix.
+    t = re.sub(r'\s+онлайн\s*$', '', t, flags=re.IGNORECASE).strip()
+
+    # Cut the Latin tail; punctuation between Cyrillic words (—, –, ., :) stays.
+    words = t.split()
+    result = []
+    for w in words:
+        if re.search(r'[а-яёА-ЯЁ]', w):
+            result.append(w)
+        elif re.search(r'[a-zA-Z]', w):
+            # The first Latin word after Cyrillic ones starts the foreign title.
+            if result:
+                break
+        elif result:
+            # Punctuation or number: kept only once the title has begun.
+            result.append(w)
+
+    # Strip dangling punctuation. Digits count as content, so a sequel number
+    # survives and 'Берсерк 2' does not collapse into 'Берсерк'.
+    while result and not re.search(r'[а-яёА-ЯЁ0-9]', result[-1]):
+        result.pop()
+
+    return ' '.join(result) if result else t
+
+
+async def _extract_pub_status(page: Page) -> str:
+    """Detect the publication status: 'completed', 'ongoing', or 'unknown' (also on error)."""
+    try:
+        result = await page.evaluate("""
+            () => {
+                // readmanga хранит статус в .elem_status .value или похожих блоках
+                const statusSelectors = [
+                    '.elem_status .value',
+                    '.manga-info .status',
+                    '[class*="status"] .value',
+                    '.property .status',
+                ];
+                for (const sel of statusSelectors) {
+                    const el = document.querySelector(sel);
+                    if (el) {
+                        const t = el.textContent.toLowerCase();
+                        if (t.includes('завершён') || t.includes('завершен') || t.includes('complete')) return 'completed';
+                        if (t.includes('продолжает') || t.includes('ongoing')) return 'ongoing';
+                    }
+                }
+                // Fallback: сканируем весь текст страницы
+                const bodyText = document.body ? document.body.innerText.toLowerCase() : '';
+                if (bodyText.includes('выпуск завершён') || bodyText.includes('выпуск завершен')) return 'completed';
+                if (bodyText.includes('продолжается')) return 'ongoing';
+                return 'unknown';
+            }
+        """)
+        return result or "unknown"  # a null/empty JS result counts as unknown
+    except Exception:  # best-effort: an evaluation failure is reported as unknown
+        return "unknown"
+
+
+async def _extract_description(page: Page) -> str:
+    """Return the manga description/synopsis text, or an empty string if none is found."""
+    try:
+        result = await page.evaluate("""
+            () => {
+                const selectors = [
+                    '.manga-description',
+                    '.elem_descr .value',
+                    '#tab-description .description-text',
+                    '.description',
+                    '[itemprop="description"]',
+                ];
+                for (const sel of selectors) {
+                    const el = document.querySelector(sel);
+                    if (el && el.textContent.trim()) return el.textContent.trim();
+                }
+                return '';
+            }
+        """)
+        return (result or "").strip()[:2000]  # cap the text at 2000 characters
+    except Exception:  # best-effort lookup: any failure here yields ""
+        return ""
+
+
+async def _extract_genres(page: Page) -> list[str]:
+    """Return the manga's genre names; an empty list if none are found or evaluation fails."""
+    try:
+        result = await page.evaluate("""
+            () => {
+                const selectors = [
+                    '.elem_genre .value a',
+                    '.genres a',
+                    '[itemprop="genre"]',
+                    '.genre-list a',
+                ];
+                for (const sel of selectors) {
+                    const els = document.querySelectorAll(sel);
+                    if (els.length) return Array.from(els).map(e => e.textContent.trim()).filter(Boolean);
+                }
+                return [];
+            }
+        """)
+        return result or []  # a null/empty JS result is normalised to []
+    except Exception:  # best-effort lookup: any failure here yields []
+        return []


 async def _navigate(page: Page, url: str, retries: int = 3,
@@ -218,6 +391,7 @@ async def get_chapter_images_and_download(
    chapter_url: str,
    dest_dir: Path,
    manga_url: str | None = None,
+    on_page: object = None,
 ) -> list[Path]:
    """
    1. Открывает страницу главы (устанавливает DDoS-Guard cookies для CDN).
@@ -225,8 +399,11 @@ async def get_chapter_images_and_download(
    3. Перехватывает img-запросы через page.route() + route.fetch()
       (браузерный стек — правильные Sec-Fetch-* заголовки, cookies).
    4. Пролистывает читалку клавишей ArrowRight чтобы загрузить все страницы.
+    5. Retry для страниц с timeout через JS fetch.
    """
-    logger.info("Загружаем главу: {}", chapter_url)
+    t_start = time.monotonic()
+    ch_id = chapter_url.split("/")[-1]   # короткий идентификатор для логов
+    logger.info("[{}] Загружаем главу: {}", ch_id, chapter_url)

    from urllib.parse import urlparse
    parsed = urlparse(chapter_url)
@@ -240,22 +417,20 @@ async def get_chapter_images_and_download(
    def _base(u: str) -> str:
        return u.split("?")[0]

-    # CDN домены которые хостят изображения манги (не статику сайта)
-    CDN_RE = re.compile(r"(?<!\bstatic\b)(^|[./])one-way\.work|staticfa\.|cdnmanga|reimg", re.I)
-    IMG_RE = re.compile(r"\.(jpg|jpeg|png|webp)(\?|$)", re.I)
+    # Баннеры/рекламные изображения — игнорируем без логирования
+    BANNER_RE = re.compile(r"466_p\.|570_p\.|banner|advert", re.I)

-    # Более точный фильтр: только image-хосты, не resrmr/статика
    def _is_manga_image(url: str) -> bool:
        base = _base(url)
-        if not IMG_RE.search(base):
+        if not re.search(r"\.(jpg|jpeg|png|webp)(\?|$)", base, re.I):
            return False
-        # Исключаем статику сайта (логотипы, иконки, шрифты)
        if "resrmr." in url or "/static/" in url:
            return False
-        # Принимаем image CDN
        return bool(re.search(r"one-way\.work|staticfa\.|rm\.one-way|cdnmanga|reimg", url, re.I))

-    captured: dict[str, bytes] = {}   # base_url → bytes
+    captured: dict[str, bytes] = {}      # base_url → bytes
+    route_errors: dict[str, str] = {}    # base_url → текст ошибки
+    route_statuses: dict[str, int] = {}  # base_url → HTTP status (не 200/206)
    lock = asyncio.Lock()

    async def route_handler(route, request):
@@ -264,23 +439,47 @@ async def get_chapter_images_and_download(
        if not _is_manga_image(url):
            await route.continue_()
            return
-        # Уже есть — пропускаем
+        if BANNER_RE.search(base):
+            await route.continue_()
+            return
        async with lock:
            already = base in captured
        if already:
            await route.continue_()
            return
+        fname = base.split("/")[-1]
        try:
            response = await route.fetch()
+            status = response.status
            body = await response.body()
-            if body and len(body) > 500 and response.status in (200, 206):
+            if body and len(body) > 500 and status in (200, 206):
                async with lock:
                    if base not in captured:
                        captured[base] = body
-                        logger.debug("✓ {}: {} байт", base.split("/")[-1], len(body))
+                        logger.debug("[{}] ✓ {}: {} байт", ch_id, fname, len(body))
+                        if on_page:
+                            try:
+                                asyncio.ensure_future(on_page(0, 0))
+                            except Exception:
+                                pass
+            else:
+                async with lock:
+                    route_statuses[base] = status
+                if status not in (200, 206):
+                    logger.warning("[{}] CDN HTTP {} для '{}' | {}",
+                                   ch_id, status, fname, base[-70:])
+                else:
+                    logger.warning("[{}] Слишком мал ответ ({} байт) для '{}'",
+                                   ch_id, len(body), fname)
            await route.fulfill(response=response)
        except Exception as e:
-            logger.debug("route.fetch {}: {}", base[-40:], e)
+            err = str(e)
+            async with lock:
+                route_errors[base] = err
+            is_timeout = "timeout" in err.lower()
+            level = logger.warning if is_timeout else logger.warning
+            level("[{}] route.fetch {} '{}': {}",
+                  ch_id, "timeout" if is_timeout else "ошибка", fname, err[:150])
            try:
                await route.continue_()
            except Exception:
@@ -292,7 +491,7 @@ async def get_chapter_images_and_download(
    ok = await _navigate(page, load_url, referer=referer)
    if not ok:
        await page.unroute("**/*", route_handler)
-        logger.error("Не удалось открыть главу: {}", chapter_url)
+        logger.error("[{}] Не удалось открыть главу после всех retry: {}", ch_id, chapter_url)
        return []

    # 2. Ждём readerInit
@@ -302,63 +501,165 @@ async def get_chapter_images_and_download(
            ".some(s => s.textContent.includes('readerInit'))",
            timeout=15_000,
        )
-    except Exception:
-        logger.debug("readerInit не появился за 15с")
+    except Exception as e:
+        logger.warning("[{}] readerInit не появился за 15с ({}). "
+                       "Продолжаем через DOM-fallback.", ch_id, str(e)[:80])

    # 3. Извлекаем список URL
    image_urls = await _extract_images_from_js(page)
    if not image_urls:
+        logger.debug("[{}] JS readerInit не дал URL, пробуем DOM-парсинг", ch_id)
        image_urls = await _extract_images_from_dom(page)
    if not image_urls:
        await page.unroute("**/*", route_handler)
-        logger.error("Список изображений пуст: {}", chapter_url)
+        try:
+            page_info = await page.evaluate("() => document.title + ' | ' + location.href")
+        except Exception:
+            page_info = "?"
+        logger.error("[{}] Список изображений пуст. Текущая страница: {}", ch_id, page_info)
        return []

-    logger.info("Найдено изображений: {}", len(image_urls))
+    logger.info("[{}] Найдено изображений: {}", ch_id, len(image_urls))
    url_to_idx = {_base(u): i for i, u in enumerate(image_urls)}
+    filename_to_idx = {_base(u).split("/")[-1]: i for i, u in enumerate(image_urls)}
    total = len(image_urls)

-    # 4. Пролистываем читалку — reader грузит страницы по мере листания
+    def _count_matched() -> int:
+        """Count captured responses that map to a page by full URL or by file name."""
+        return sum(
+            1 for got in captured
+            if got in url_to_idx or got.split("/")[-1] in filename_to_idx
+        )
+
+    # 4. Пролистываем читалку
    await asyncio.sleep(1)
-    for i in range(total + 10):
-        async with lock:
-            done = len(captured)
+    stall_count = 0
+    prev_done = -1
+    for i in range(total + 20):
+        done = _count_matched()
        if done >= total:
            break
        try:
            await page.keyboard.press("ArrowRight")
            await asyncio.sleep(0.5)
-        except Exception:
+        except Exception as e:
+            logger.warning("[{}] Ошибка листания на шаге {}: {}", ch_id, i + 1, e)
            break
        if i % 20 == 19:
-            async with lock:
-                done = len(captured)
-            logger.debug("Пролистано {}, загружено: {}/{}", i + 1, done, total)
+            done = _count_matched()
+            logger.debug("[{}] Пролистано {}, загружено: {}/{}", ch_id, i + 1, done, total)
+            if done == prev_done:
+                stall_count += 1
+                if stall_count >= 3:
+                    logger.warning("[{}] Прогресс завис ({}/{}) после {} листаний — прерываем",
+                                   ch_id, done, total, i + 1)
+                    break
+            else:
+                stall_count = 0
+            prev_done = done

    # Финальное ожидание
    await asyncio.sleep(3)
+
+    # 5. Retry для страниц с timeout через браузерный JS fetch
+    async with lock:
+        timeout_bases = [u for u, e in route_errors.items()
+                         if "timeout" in e.lower() and u not in captured]
+    if timeout_bases:
+        logger.info("[{}] Retry {} страниц с timeout через JS fetch...",
+                    ch_id, len(timeout_bases))
+        for retry_base in timeout_bases:
+            if retry_base in captured:
+                continue
+            fname = retry_base.split("/")[-1]
+            try:
+                data_b64 = await page.evaluate("""async (url) => {
+                    try {
+                        const r = await fetch(url, {credentials: 'include'});
+                        if (!r.ok) return null;
+                        const buf = await r.arrayBuffer();
+                        const bytes = new Uint8Array(buf);
+                        let bin = '';
+                        for (let b of bytes) bin += String.fromCharCode(b);
+                        return btoa(bin);
+                    } catch(e) { return null; }
+                }""", retry_base)
+                if data_b64:
+                    import base64
+                    body = base64.b64decode(data_b64)
+                    if len(body) > 500:
+                        async with lock:
+                            captured[retry_base] = body
+                        logger.info("[{}] Retry OK: {} ({} байт)", ch_id, fname, len(body))
+                    else:
+                        logger.warning("[{}] Retry вернул {} байт для '{}' — игнорируем",
+                                       ch_id, len(body), fname)
+                else:
+                    logger.warning("[{}] Retry вернул null для '{}' | {}",
+                                   ch_id, fname, retry_base[-70:])
+            except Exception as e2:
+                logger.warning("[{}] Retry JS ошибка для '{}': {}", ch_id, fname, e2)
+
    await page.unroute("**/*", route_handler)

-    async with lock:
-        done = len(captured)
-    logger.info("Перехвачено: {}/{}", done, total)
+    done = _count_matched()
+    elapsed = time.monotonic() - t_start
+    logger.info("[{}] Перехвачено: {}/{} за {:.1f}с", ch_id, done, total, elapsed)
+
+    # 6. Сохраняем в правильном порядке
+    filename_to_idx = {_base(u).split("/")[-1]: i for i, u in enumerate(image_urls)}

-    # 5. Сохраняем в правильном порядке
    paths: dict[int, Path] = {}
+    unmatched_other: list[str] = []
    for base_url, body in captured.items():
-        if base_url not in url_to_idx:
+        idx = url_to_idx.get(base_url)
+        if idx is None:
+            fname = base_url.split("/")[-1]
+            idx = filename_to_idx.get(fname)
+        if idx is None:
+            if not BANNER_RE.search(base_url):
+                unmatched_other.append(base_url.split("/")[-1])
            continue
-        idx = url_to_idx[base_url]
        ext = _get_ext(base_url)
        p = dest_dir / f"{idx:04d}{ext}"
        p.write_bytes(body)
        paths[idx] = p

-    missing = total - len(paths)
-    if missing:
-        logger.warning("Не загружено страниц: {}", missing)
+    if unmatched_other:
+        logger.debug("[{}] Перехвачено, но не совпало с readerInit ({}): {}",
+                     ch_id, len(unmatched_other), unmatched_other)
+
+    # 7. Итоговый отчёт по пропущенным страницам
+    missing_idxs = [i for i in range(total) if i not in paths]
+    if missing_idxs:
+        missing_files = [_base(image_urls[i]).split("/")[-1] for i in missing_idxs]
+        missing_full  = [_base(image_urls[i]) for i in missing_idxs]
+
+        timeout_miss = [missing_files[j] for j, i in enumerate(missing_idxs)
+                        if missing_full[j] in route_errors
+                        and "timeout" in route_errors[missing_full[j]].lower()]
+        http_miss    = [f"{missing_files[j]}(HTTP {route_statuses.get(missing_full[j], '?')})"
+                        for j, i in enumerate(missing_idxs)
+                        if missing_full[j] in route_statuses]
+        unrcv        = [missing_files[j] for j, i in enumerate(missing_idxs)
+                        if missing_full[j] not in route_errors
+                        and missing_full[j] not in route_statuses]
+
+        reasons = []
+        if timeout_miss:
+            reasons.append(f"timeout×{len(timeout_miss)}: {timeout_miss}")
+        if http_miss:
+            reasons.append(f"HTTP-err×{len(http_miss)}: {http_miss}")
+        if unrcv:
+            reasons.append(f"не_перехвачено×{len(unrcv)}: {unrcv}")
+
+        logger.warning(
+            "[{}] Пропущено {}/{} стр. | №: {} | причины: {}",
+            ch_id, len(missing_idxs), total,
+            [i + 1 for i in missing_idxs],
+            " | ".join(reasons) if reasons else "неизвестно",
+        )
+        logger.debug("[{}] Полные URL пропущенных: {}", ch_id, missing_full)

    return [paths[i] for i in sorted(paths.keys())]

-
-