This commit is contained in:
2026-05-02 21:59:59 +03:00
parent 419614d295
commit a7eaa22646
5 changed files with 672 additions and 372 deletions

View File

@@ -234,9 +234,9 @@ async def _analyze(url: str):
paths = await source.get_chapter_images_and_download(
page, first.url, dest_dir=Path(tmp), manga_url=url
)
click.echo(f" Скачано изображений: {len(paths)}")
for p in paths[:3]:
click.echo(f" {p.name} ({p.stat().st_size} байт)")
click.echo(f" Скачано изображений: {len(paths)}")
for p in paths[:3]:
click.echo(f" {p.name} ({p.stat().st_size} байт)")
db.close()

View File

@@ -10,11 +10,13 @@ from typing import Optional
from .base import MangaSourceProtocol
from .readmanga import ReadmangaSource
from .mangalib import MangalibSource
# ── Регистрация источников ─────────────────────
# Добавьте новые источники сюда:
SOURCES: list = [
ReadmangaSource(),
MangalibSource(),
]
# Быстрый поиск по slug

640
src/sources/mangalib.py Normal file
View File

@@ -0,0 +1,640 @@
"""
Адаптер MangaLib: поддерживает mangalib.me и его зеркала.
Принцип работы:
- Список глав: перехватываем ответ api.cdnlibs.org/api/manga/{slug}/chapters
Возвращает все главы сразу (не требует пагинации).
URL главы: {origin}/ru/{manga_slug}/read/v{vol}/c{num}
- Страница главы: перехватываем ответ api.cdnlibs.org/api/manga/{slug}/chapter?...
Получаем pages[] с полями: image (filename), url (relative path), slug (page index 1-based).
Изображения: {server}{page.url}, CDN = mixlib.me / imglib.info.
"""
import asyncio
import json as _json
import re
import time
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse
from loguru import logger
from playwright.async_api import Page
from .base import Chapter, MangaInfo
class MangalibSource:
slug = "mangalib"
display_name = "MangaLib"
# CDN-домены для изображений глав (актуальные)
cdn_patterns = ["mixlib.me", "imglib.info", "imglib", "imgslib"]
# ──────────────────────────────────────────────
# Страница манги — список глав
# ──────────────────────────────────────────────
async def get_manga_info(self, page: Page, url: str) -> Optional[MangaInfo]:
"""Открывает страницу манги и возвращает список всех глав."""
logger.info("Загружаем страницу манги MangaLib: {}", url)
chapters_url = _ensure_chapters_section(url)
base_manga_url = url.split("?")[0].rstrip("/")
# Слушаем API-ответы до навигации
chapters_api_data: list = []
manga_api_data: dict = {}
lock = asyncio.Lock()
async def on_response(resp):
resp_url = resp.url
if "api.cdnlibs.org" not in resp_url:
return
try:
# api.cdnlibs.org/api/manga/{slug}/chapters (без query-параметров)
if re.search(r"/chapters$", resp_url):
body = await resp.body()
data = _json.loads(body)
raw = data.get("data", [])
if isinstance(raw, list) and raw:
async with lock:
if not chapters_api_data:
chapters_api_data.extend(raw)
logger.debug("Chapters API: {} глав получено", len(raw))
# api.cdnlibs.org/api/manga/{slug}?fields[]=...
elif re.search(r"/manga/[^/]+$", resp_url.split("?")[0]) and "fields" in resp_url:
body = await resp.body()
data = _json.loads(body)
raw = data.get("data", {})
if isinstance(raw, dict) and raw:
async with lock:
if not manga_api_data:
manga_api_data.update(raw)
except Exception as e:
logger.debug("API parse error: {}", e)
page.on("response", on_response)
ok = await _navigate(page, chapters_url)
if not ok:
page.remove_listener("response", on_response)
return None
# Ждём API-ответов (обычно приходят за 1-3 секунды)
for _ in range(30):
async with lock:
if chapters_api_data:
break
await asyncio.sleep(0.3)
page.remove_listener("response", on_response)
# Извлекаем pub_status из API манги (надёжнее DOM)
async with lock:
manga_meta = dict(manga_api_data)
pub_status = _pub_status_from_api(manga_meta)
if pub_status == "unknown":
pub_status = await _extract_pub_status(page)
# Предпочитаем имена из API (надёжнее DOM и page.title)
async with lock:
manga_meta_snap = dict(manga_api_data)
title_ru = (manga_meta_snap.get("rus_name") or "").strip()
title_name = (manga_meta_snap.get("name") or "").strip()
if not title_ru:
title_ru = await _extract_title(page)
title_full = (f"{title_ru} / {title_name}" if title_name and title_ru and title_name != title_ru
else title_ru or title_name)
if not title_full:
try:
page_title = await page.title()
page_title = re.sub(r"\s*([-|•]|читать|онлайн).*$", "", page_title, flags=re.IGNORECASE).strip()
title_full = page_title
except Exception:
pass
if not title_ru:
title_ru = title_full
logger.info("Манга: {} | ru: {}", title_full, title_ru)
logger.info("Статус выпуска: {}", pub_status)
description = await _extract_description(page)
genres = await _extract_genres(page)
async with lock:
raw_chapters = list(chapters_api_data)
if raw_chapters:
chapters = _chapters_from_api(raw_chapters, base_manga_url)
else:
logger.warning("Chapters API не ответил, используем DOM-fallback")
chapters = await _chapters_from_dom(page, base_manga_url)
logger.info("Найдено глав: {}", len(chapters))
return MangaInfo(
title=title_ru or title_full,
url=url,
chapters=chapters,
pub_status=pub_status,
title_ru=title_ru,
title_full=title_full,
description=description,
genres=genres,
)
# ──────────────────────────────────────────────
# Скачивание главы
# ──────────────────────────────────────────────
async def get_chapter_images_and_download(
self,
page: Page,
chapter_url: str,
dest_dir: Path,
manga_url: Optional[str] = None,
on_page: object = None,
) -> list[Path]:
"""
1. Открывает страницу читалки.
2. Пассивно наблюдает ответы через page.on("response"):
- api.cdnlibs.org/chapter? → список страниц
- api.cdnlibs.org/imageServers → серверы CDN
3. Скачивает все страницы через page.context.request.get()
(разделяет cookies с браузером, без CORS-ограничений).
"""
t_start = time.monotonic()
ch_id = chapter_url.rstrip("/").split("/")[-1]
logger.info("[{}] Загружаем главу MangaLib: {}", ch_id, chapter_url)
dest_dir.mkdir(parents=True, exist_ok=True)
referer_origin = _base_url(manga_url or chapter_url)
chapter_api: dict = {}
image_servers: list = []
lock = asyncio.Lock()
async def on_response(resp):
resp_url = resp.url
try:
if "api.cdnlibs.org" in resp_url and "/chapter?" in resp_url:
body = await resp.body()
data = _json.loads(body)
async with lock:
if not chapter_api.get("pages"):
chapter_api.update(data.get("data", {}))
elif "api.cdnlibs.org" in resp_url and "imageServers" in resp_url:
body = await resp.body()
data = _json.loads(body)
servers = data.get("data", {}).get("imageServers", [])
async with lock:
if not image_servers:
image_servers.extend(s["url"] for s in servers if "url" in s)
except Exception:
pass
page.on("response", on_response)
referer = manga_url or referer_origin
ok = await _navigate(page, chapter_url, referer=referer)
if not ok:
page.remove_listener("response", on_response)
logger.error("[{}] Не удалось открыть главу: {}", ch_id, chapter_url)
return []
# Ждём ответ chapter API (обычно приходит за 1-3 секунды)
for _ in range(40):
async with lock:
if chapter_api.get("pages"):
break
await asyncio.sleep(0.5)
page.remove_listener("response", on_response)
async with lock:
pages_info = list(chapter_api.get("pages", []))
servers_list = list(image_servers)
if not pages_info:
try:
page_info = await page.evaluate("() => document.title + ' | ' + location.href")
except Exception:
page_info = "?"
logger.error("[{}] API не вернул данные страниц. Страница: {}", ch_id, page_info)
return []
total = len(pages_info)
logger.info("[{}] Страниц по API: {}", ch_id, total)
# Строим маппинг: filename → 0-based index (slug 1-based)
fname_to_idx: dict[str, int] = {}
page_url_by_idx: dict[int, str] = {}
for p in pages_info:
try:
idx = int(p.get("slug", 0)) - 1
if idx < 0:
continue
fname = p.get("image", "")
url_part = p.get("url", "")
if fname:
fname_to_idx[fname] = idx
if url_part:
page_url_by_idx[idx] = url_part
url_fname = url_part.rstrip("/").split("/")[-1]
if url_fname and url_fname not in fname_to_idx:
fname_to_idx[url_fname] = idx
except Exception:
pass
# Определяем CDN сервер из img src или constants API
server = await _detect_server(page, servers_list)
logger.info("[{}] CDN сервер: {}", ch_id, server)
alt_servers = [s for s in servers_list if s != server]
# Скачиваем все страницы через Playwright APIRequestContext
captured: dict[str, bytes] = {}
failed_idxs: list[int] = []
all_servers = [server] + alt_servers
logger.info("[{}] Скачиваем {} страниц...", ch_id, total)
for idx in range(total):
url_part = page_url_by_idx.get(idx, "")
if not url_part:
continue
fname = url_part.rstrip("/").split("/")[-1]
body = None
for srv in all_servers:
body = await _api_fetch(page, srv + url_part, referer_origin)
if body:
if srv != server:
logger.debug("[{}] alt сервер OK: {} ({})", ch_id, fname, srv)
break
if body:
captured[fname] = body
logger.debug("[{}] ✓ {}: {} байт", ch_id, fname, len(body))
if on_page:
try:
asyncio.ensure_future(on_page(0, 0))
except Exception:
pass
else:
failed_idxs.append(idx)
# Retry провалившихся страниц с задержкой
if failed_idxs:
logger.info("[{}] Retry {} страниц с задержкой...", ch_id, len(failed_idxs))
await asyncio.sleep(2)
for idx in failed_idxs:
url_part = page_url_by_idx.get(idx, "")
if not url_part:
continue
fname = url_part.rstrip("/").split("/")[-1]
body = None
for srv in all_servers:
body = await _api_fetch(page, srv + url_part, referer_origin)
if body:
break
if body:
captured[fname] = body
logger.info("[{}] Retry OK: {} ({} байт)", ch_id, fname, len(body))
if on_page:
try:
asyncio.ensure_future(on_page(0, 0))
except Exception:
pass
else:
logger.warning("[{}] Не удалось скачать: {}", ch_id, fname)
elapsed = time.monotonic() - t_start
matched = sum(1 for f in captured if f in fname_to_idx)
logger.info("[{}] Скачано: {}/{} за {:.1f}с", ch_id, matched, total, elapsed)
# Сохраняем файлы
paths: dict[int, Path] = {}
for fname, body in captured.items():
idx = fname_to_idx.get(fname)
if idx is None:
continue
ext = _get_ext(fname)
p = dest_dir / f"{idx:04d}{ext}"
p.write_bytes(body)
paths[idx] = p
missing_idxs = [i for i in range(total) if i not in paths]
if missing_idxs:
logger.warning("[{}] Пропущено {}/{} стр. №: {}",
ch_id, len(missing_idxs), total, [i + 1 for i in missing_idxs])
return [paths[i] for i in sorted(paths.keys())]
# ──────────────────────────────────────────────
# Вспомогательные функции (приватные)
# ──────────────────────────────────────────────
def _ensure_chapters_section(url: str) -> str:
if "section=chapters" in url:
return url
sep = "&" if "?" in url else "?"
return url + sep + "section=chapters"
def _manga_slug_from_url(url: str) -> str:
"""Извлекает slug манги из URL страницы или главы.
Примеры входных URL:
https://mangalib.me/ru/manga/11312--subete... → 11312--subete...
https://mangalib.me/ru/11312--subete.../read/v1/c1 → 11312--subete...
"""
parsed = urlparse(url)
parts = [p for p in parsed.path.split("/") if p]
# Убираем языковой префикс ('ru', 'en', ...)
if parts and len(parts[0]) <= 3 and parts[0].isalpha():
parts = parts[1:]
# Убираем 'manga' если есть
if parts and parts[0] == "manga":
parts = parts[1:]
return parts[0] if parts else ""
def _chapters_from_api(raw: list, manga_url: str) -> list[Chapter]:
"""Строит список глав из ответа api.cdnlibs.org/chapters."""
parsed = urlparse(manga_url)
origin = f"{parsed.scheme}://{parsed.netloc}"
slug = _manga_slug_from_url(manga_url)
# Определяем языковой префикс из оригинального URL (/ru/, /en/, ...)
path_parts = [p for p in parsed.path.split("/") if p]
lang_prefix = path_parts[0] if path_parts and len(path_parts[0]) <= 3 else "ru"
chapters = []
for ch in raw:
try:
vol = str(ch.get("volume") or "1")
num = str(ch.get("number") or "0")
name = ch.get("name") or ""
try:
number_f = float(num)
except Exception:
number_f = 0.0
try:
vol_i = int(float(vol))
except Exception:
vol_i = 0
ch_url = f"{origin}/{lang_prefix}/{slug}/read/v{vol}/c{num}"
title = f"Том {vol}, Глава {num}"
if name:
title += f" - {name}"
chapters.append(Chapter(title=title, url=ch_url, number=number_f, volume=vol_i))
except Exception as e:
logger.debug("Пропуск главы из API: {}", e)
chapters.sort(key=lambda c: (c.volume, c.number))
return chapters
async def _chapters_from_dom(page: Page, manga_url: str) -> list[Chapter]:
"""Fallback: извлекает главы из DOM-ссылок вида /read/v{vol}/c{num}."""
try:
raw = await page.evaluate("""
() => {
const links = Array.from(document.querySelectorAll('a[href*="/read/v"]'));
const result = [];
const seen = new Set();
for (const a of links) {
const href = a.href;
if (!href || seen.has(href)) continue;
if (!/\\/read\\/v\\d/.test(href)) continue;
const text = a.textContent.trim();
// Пропускаем ссылки без нормального текста (кнопки навигации и т.п.)
if (!text || /^\\d+\\s*\\/\\s*\\d+$/.test(text)) continue;
seen.add(href);
result.push({ href, text });
}
return result;
}
""")
if not raw:
return []
chapters = []
for item in raw:
href = item["href"]
m = re.search(r"/read/v(\d+(?:\.\d+)?)/c(\d+(?:\.\d+)?)", href)
if not m:
continue
vol_s, num_s = m.group(1), m.group(2)
try:
number_f = float(num_s)
vol_i = int(float(vol_s))
except Exception:
continue
text = item["text"] or f"Том {vol_s}, Глава {num_s}"
chapters.append(Chapter(title=text, url=href, number=number_f, volume=vol_i))
chapters.sort(key=lambda c: (c.volume, c.number))
return chapters
except Exception as e:
logger.debug("_chapters_from_dom: {}", e)
return []
def _pub_status_from_api(manga_meta: dict) -> str:
"""Извлекает статус публикации из ответа API манги."""
status = manga_meta.get("status", {})
if isinstance(status, dict):
label = (status.get("label") or "").lower()
if "завершён" in label or "завершен" in label or "complete" in label:
return "completed"
if "продолжает" in label or "ongoing" in label or "выпускает" in label:
return "ongoing"
return "unknown"
async def _navigate(page: Page, url: str, retries: int = 3,
referer: str | None = None) -> bool:
if referer is None:
p = urlparse(url)
referer = f"{p.scheme}://{p.netloc}/"
for attempt in range(1, retries + 1):
try:
resp = await page.goto(url, wait_until="domcontentloaded",
timeout=60_000, referer=referer)
if resp and resp.status >= 400:
logger.warning("Попытка {}/{}: HTTP {}", attempt, retries, resp.status)
await asyncio.sleep(3 * attempt)
continue
try:
await page.wait_for_load_state("networkidle", timeout=15_000)
except Exception:
pass
return True
except Exception as e:
logger.warning("Попытка {}/{}: {}", attempt, retries, e)
await asyncio.sleep(3 * attempt)
return False
async def _extract_title(page: Page) -> str:
try:
result = await page.evaluate("""
() => {
if (window.__DATA__ && window.__DATA__.manga) {
const m = window.__DATA__.manga;
return m.rus_name || m.name || '';
}
const selectors = [
'.media-name__main',
'.manga-name h1',
'h1.media-title',
'h1.page-title',
'h1',
];
for (const sel of selectors) {
const el = document.querySelector(sel);
if (el && el.textContent.trim()) return el.textContent.trim();
}
return '';
}
""")
return (result or "").strip()
except Exception:
return ""
async def _extract_pub_status(page: Page) -> str:
try:
result = await page.evaluate("""
() => {
if (window.__DATA__ && window.__DATA__.manga && window.__DATA__.manga.status) {
const s = window.__DATA__.manga.status;
const label = (s.label || s.name || '').toLowerCase();
if (label.includes('завершён') || label.includes('завершен') || label.includes('complete')) return 'completed';
if (label.includes('продолжает') || label.includes('ongoing') || label.includes('выпускает')) return 'ongoing';
}
const selectors = [
'.media-info-item__status',
'.status-value',
'[class*="status"] .value',
'[class*="status"]',
];
for (const sel of selectors) {
const el = document.querySelector(sel);
if (!el) continue;
const t = el.textContent.toLowerCase();
if (t.includes('завершён') || t.includes('завершен') || t.includes('complete')) return 'completed';
if (t.includes('продолжает') || t.includes('ongoing')) return 'ongoing';
}
return 'unknown';
}
""")
return result or "unknown"
except Exception:
return "unknown"
async def _extract_description(page: Page) -> str:
try:
result = await page.evaluate("""
() => {
if (window.__DATA__ && window.__DATA__.manga && window.__DATA__.manga.summary) {
return window.__DATA__.manga.summary;
}
const selectors = [
'.media-description__text',
'.description-text',
'.manga-description',
'[class*="description"] p',
];
for (const sel of selectors) {
const el = document.querySelector(sel);
if (el && el.textContent.trim()) return el.textContent.trim();
}
return '';
}
""")
return (result or "").strip()[:2000]
except Exception:
return ""
async def _extract_genres(page: Page) -> list[str]:
try:
result = await page.evaluate("""
() => {
if (window.__DATA__ && window.__DATA__.manga && window.__DATA__.manga.genres) {
return window.__DATA__.manga.genres.map(g => g.name || g.label || '').filter(Boolean);
}
const selectors = [
'.genre-list a',
'.media-tags a',
'.tags a',
'[class*="genre"] a',
];
for (const sel of selectors) {
const els = document.querySelectorAll(sel);
if (els.length) return Array.from(els).map(e => e.textContent.trim()).filter(Boolean);
}
return [];
}
""")
return result or []
except Exception:
return []
async def _detect_server(page: Page, servers_list: list[str]) -> str:
"""Определяет CDN-сервер из img src на странице или из constants API."""
try:
imgs = await page.evaluate("""() =>
Array.from(document.querySelectorAll('img')).map(i => i.src)
.filter(s => s && /https?:\\/\\//.test(s) && /\\.(png|jpg|webp)/i.test(s))
""")
for img_src in imgs:
m = re.match(r"(https?://[^/]+)", img_src)
if m:
srv = m.group(1)
if any(pat in srv for pat in ["mixlib", "imglib", "imgslib"]):
return srv
except Exception:
pass
if servers_list:
return servers_list[0]
return "https://img3.mixlib.me"
async def _api_fetch(page: Page, url: str, referer: str = "") -> bytes | None:
"""
Скачивает изображение через Playwright APIRequestContext.
Разделяет cookies с браузерным контекстом, не ограничен CORS.
"""
try:
headers: dict = {"Accept": "image/png,image/jpeg,image/webp,image/*,*/*"}
if referer:
headers["Referer"] = referer
response = await page.context.request.get(url, headers=headers)
if response.ok:
body = await response.body()
return body if len(body) > 500 else None
except Exception:
pass
return None
def _get_ext(url: str) -> str:
m = re.search(r"\.(jpg|jpeg|png|webp)(\?|$)", url, re.IGNORECASE)
if m:
ext = m.group(1).lower()
return ".jpg" if ext == "jpeg" else f".{ext}"
return ".jpg"
def _base_url(url: str) -> str:
m = re.match(r"(https?://[^/]+)", url)
return m.group(1) if m else "https://mangalib.me"

View File

@@ -20,6 +20,15 @@ _DEFAULT_READMANGA_DOMAINS = [
"3.readmanga.ru",
]
# Домены MangaLib по умолчанию (сидинг при первом запуске)
_DEFAULT_MANGALIB_DOMAINS = [
"mangalib.me",
"mangalib.org",
"hentailib.me",
"yaoilib.me",
"readlib.net",
]
_ALLOWED_FMTS = frozenset({"cbz", "pdf", "epub"})
@@ -197,6 +206,24 @@ class StateDB:
self.conn.commit()
logger.info("Сидинг доменов ReadManga: {} доменов", len(_DEFAULT_READMANGA_DOMAINS))
# Сидинг доменов MangaLib при первом запуске
ml = self.conn.execute("SELECT id FROM sources WHERE slug='mangalib'").fetchone()
if ml:
count = self.conn.execute(
"SELECT COUNT(*) FROM source_domains WHERE source_id=?", (ml["id"],)
).fetchone()[0]
if count == 0:
for domain in _DEFAULT_MANGALIB_DOMAINS:
try:
self.conn.execute(
"INSERT INTO source_domains (source_id, domain) VALUES (?,?)",
(ml["id"], domain)
)
except Exception:
pass
self.conn.commit()
logger.info("Сидинг доменов MangaLib: {} доменов", len(_DEFAULT_MANGALIB_DOMAINS))
# Логируем источники в БД без кода (не в реестре)
known_slugs = set(registry.all_slugs())
db_slugs = [r["slug"] for r in self.conn.execute("SELECT slug FROM sources").fetchall()]