init
This commit is contained in:
364
src/scraper.py
Normal file
364
src/scraper.py
Normal file
@@ -0,0 +1,364 @@
|
||||
"""
|
||||
Парсер readmanga.ru: список глав и URL/байты изображений внутри главы.
|
||||
"""
|
||||
import asyncio
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from loguru import logger
|
||||
from playwright.async_api import Page
|
||||
|
||||
from .browser import BrowserManager
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Модели данных
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
@dataclass
class Chapter:
    """One chapter of a manga as found in the chapter list."""

    # Text of the chapter link.
    title: str
    # URL of the chapter page.
    url: str
    # Chapter number; stays 0.0 when no number was parsed.
    number: float = 0.0
    # Volume number; stays 0 when no volume was parsed.
    volume: int = 0
|
||||
|
||||
|
||||
@dataclass
class MangaInfo:
    """A manga title together with the chapters discovered for it."""

    # Title of the manga.
    title: str
    # URL of the manga page.
    url: str
    # Discovered chapters; every instance gets its own fresh list.
    chapters: list[Chapter] = field(default_factory=list)
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Страница манги — список глав
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
async def get_manga_info(page: Page, url: str) -> Optional[MangaInfo]:
    """Open a manga page and collect its title and chapter list.

    Returns ``None`` when the page could not be loaded. Otherwise returns
    a ``MangaInfo``; its ``chapters`` list may be empty when neither the
    primary nor the fallback parser found anything.
    """
    logger.info("Загружаем страницу манги: {}", url)
    if not await _navigate(page, url):
        return None

    # Keep only the part of the page title before the first "-", "–" or "|".
    # NOTE(review): this also truncates hyphenated names at the first
    # hyphen — confirm that is intended.
    raw_title = await page.title()
    title = re.sub(r"\s*[-–|].*$", "", raw_title).strip()
    logger.info("Манга: {}", title)

    await _expand_chapters(page)

    # Primary table parser first; fall back to the generic link scan
    # only when it yields nothing.
    chapters = await _extract_chapters(page) or await _extract_chapters_alt(page)

    logger.info("Найдено глав: {}", len(chapters))
    return MangaInfo(title=title, url=url, chapters=chapters)
|
||||
|
||||
|
||||
async def _navigate(page: Page, url: str, retries: int = 3,
                    referer: str | None = None) -> bool:
    """Load *url* in *page*, retrying with a linear back-off.

    Args:
        page: Playwright page to navigate.
        url: Address to open.
        retries: Maximum number of navigation attempts.
        referer: Referer passed to the navigation. When omitted, the root
            of the target site (``scheme://host/``) is used.

    Returns:
        ``True`` as soon as a navigation finishes with a status below 400
        (or without a response object), ``False`` when every attempt
        failed.
    """
    from urllib.parse import urlparse

    if referer is None:
        target = urlparse(url)
        referer = f"{target.scheme}://{target.netloc}/"

    for attempt in range(1, retries + 1):
        try:
            resp = await page.goto(url, wait_until="domcontentloaded",
                                   timeout=60_000, referer=referer)
            if resp and resp.status >= 400:
                logger.warning("Попытка {}/{}: HTTP {}", attempt, retries, resp.status)
            else:
                try:
                    await page.wait_for_load_state("networkidle", timeout=10_000)
                except Exception:
                    # Best effort: the DOM is already loaded, a page that
                    # never goes network-idle is still usable.
                    pass
                return True
        except Exception as e:
            logger.warning("Попытка {}/{}: {}", attempt, retries, e)

        # Back off before the next attempt only; sleeping after the last
        # failure would just delay the False result.
        if attempt < retries:
            await asyncio.sleep(3 * attempt)
    return False
|
||||
|
||||
|
||||
async def _expand_chapters(page: Page):
    """Click the "show all chapters" control when the page has one.

    The known selectors are tried in order and the function stops after
    the first control that could be clicked. This is best-effort: a page
    without such a control is left untouched and nothing is raised.
    """
    selectors = (
        "a.chapter-link.all",
        "button:has-text('Все главы')",
        "a:has-text('Все главы')",
    )
    for sel in selectors:
        el = page.locator(sel).first
        try:
            # Locator.is_visible() ignores its timeout and answers
            # immediately, so wait explicitly for the control to appear.
            await el.wait_for(state="visible", timeout=2000)
            await el.click()
        except Exception:
            # Control absent or not clickable: try the next selector.
            continue

        try:
            await page.wait_for_load_state("networkidle", timeout=10_000)
        except Exception:
            # The click already happened; a page that never goes
            # network-idle must not trigger clicks on further selectors.
            pass
        return
|
||||
|
||||
|
||||
async def _extract_chapters(page: Page) -> list[Chapter]:
    """Primary parser for the chapter table (``#chapters-list tr.item-row``).

    For every row the chapter link is read from
    ``td[class*='item-title'] a`` and the numbering from the ``data-vol``
    and ``data-num`` attributes of ``td[data-num]``. ``data-num`` is
    divided by 10 to obtain the chapter number. Rows without a link or
    without an ``href`` are skipped.

    Returns:
        Chapters in document order; an empty list when no rows match.
    """
    from urllib.parse import urljoin

    def _to_int(raw: str | None) -> int:
        # A malformed attribute on one row must not abort the whole listing.
        try:
            return int(raw or "0")
        except ValueError:
            return 0

    # Resolve hrefs against the current page so that relative and
    # protocol-relative links become valid absolute URLs. Fall back to the
    # site root when the page URL is not an http(s) address.
    base = page.url
    if not base.startswith(("http://", "https://")):
        base = _base_url(base) + "/"

    chapters: list[Chapter] = []
    for row in await page.query_selector_all("#chapters-list tr.item-row"):
        link = await row.query_selector("td[class*='item-title'] a")
        if not link:
            continue
        href = await link.get_attribute("href") or ""
        if not href:
            continue
        text = (await link.inner_text()).strip()

        td = await row.query_selector("td[data-num]")
        vol = _to_int(await td.get_attribute("data-vol")) if td else 0
        num_raw = _to_int(await td.get_attribute("data-num")) if td else 0

        chapters.append(Chapter(title=text, url=urljoin(base, href),
                                number=num_raw / 10.0, volume=vol))
    return chapters
|
||||
|
||||
|
||||
async def _extract_chapters_alt(page: Page) -> list[Chapter]:
    """Fallback parser: treat every link whose href contains ``/vol`` as a chapter.

    Links with an empty href or empty text are dropped in the browser.
    Chapter and volume numbers are parsed from the link text.
    """
    found = await page.evaluate("""
        () => {
            const links = Array.from(document.querySelectorAll('a[href*="/vol"]'));
            return links.map(a => ({ href: a.href, text: a.textContent.trim() }))
                        .filter(x => x.href && x.text);
        }
    """)

    chapters = []
    for entry in found:
        label = entry["text"]
        chapters.append(
            Chapter(
                title=label,
                url=entry["href"],
                number=_parse_num(label),
                volume=_parse_vol(label),
            )
        )
    return chapters
|
||||
|
||||
|
||||
def _base_url(url: str) -> str:
|
||||
m = re.match(r"(https?://[^/]+)", url)
|
||||
return m.group(1) if m else "https://readmanga.ru"
|
||||
|
||||
|
||||
def _parse_num(text: str) -> float:
|
||||
m = re.search(r"[\d]+(?:[.,]\d+)?", text.replace(",", "."))
|
||||
return float(m.group()) if m else 0.0
|
||||
|
||||
|
||||
def _parse_vol(text: str) -> int:
|
||||
m = re.search(r"Том\s+(\d+)", text, re.IGNORECASE)
|
||||
return int(m.group(1)) if m else 0
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Страница главы — получение URL изображений
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
async def _extract_images_from_js(page: Page) -> list[str]:
    """
    Extract image URLs from rm_h.readerInit(chapterInfo, [[base, '', path, w, h], ...]).

    The in-page script scans every <script> element for the text
    'readerInit', takes the first '[' after it and counts brackets to
    capture the array exactly. Each entry with at least three items
    becomes item[0] + item[2]; other entries are dropped.

    The bracket counter does not look at string contents, so a '[' or ']'
    inside a string literal would shift the captured range.

    Returns the list of URLs, or an empty list when nothing was found or
    the evaluation failed.
    """
    try:
        # NOTE(review): the captured text is passed to eval() inside the
        # page; it is the page's own script text — confirm this is acceptable.
        result = await page.evaluate("""
            () => {
                for (const s of document.querySelectorAll('script')) {
                    const text = s.textContent || '';
                    const mi = text.indexOf('readerInit');
                    if (mi === -1) continue;
                    const ai = text.indexOf('[', mi);
                    if (ai === -1) continue;
                    let depth = 0, end = -1;
                    for (let i = ai; i < text.length; i++) {
                        if (text[i] === '[') depth++;
                        else if (text[i] === ']') { depth--; if (!depth) { end = i+1; break; } }
                    }
                    if (end === -1) continue;
                    try {
                        const arr = eval(text.slice(ai, end));
                        if (Array.isArray(arr) && arr.length)
                            return arr.map(item => Array.isArray(item) && item.length >= 3
                                ? item[0] + item[2] : null).filter(Boolean);
                    } catch(e) {}
                }
                return [];
            }
        """)
        if result:
            logger.debug("JS readerInit нашёл {} изображений", len(result))
        # Normalise a falsy result to an empty list.
        return result or []
    except Exception as e:
        # Best effort: the caller falls back on an empty result.
        logger.debug("JS-метод не сработал: {}", e)
        return []
|
||||
|
||||
|
||||
async def _extract_images_from_dom(page: Page) -> list[str]:
    """Collect image URLs straight from <img> elements of the reader.

    The selectors are tried in order; the first one that matches any
    element decides the result (``src``, else ``data-src``, empty values
    dropped). Returns an empty list when nothing matches or the
    evaluation fails.
    """
    script = """
        () => {
            for (const sel of ['img.manga-page', '.page-image img', '#mangaReader img', 'img[data-src]']) {
                const found = Array.from(document.querySelectorAll(sel));
                if (found.length) return found.map(i => i.src || i.dataset.src).filter(Boolean);
            }
            return [];
        }
    """
    try:
        urls = await page.evaluate(script)
    except Exception:
        return []
    return urls or []
|
||||
|
||||
|
||||
def _get_ext(url: str) -> str:
|
||||
m = re.search(r"\.(jpg|jpeg|png|webp)(\?|$)", url, re.IGNORECASE)
|
||||
if m:
|
||||
ext = m.group(1).lower()
|
||||
return ".jpg" if ext == "jpeg" else f".{ext}"
|
||||
return ".jpg"
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Скачивание главы
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
async def get_chapter_images_and_download(
    page: Page,
    chapter_url: str,
    dest_dir: Path,
    manga_url: str | None = None,
) -> list[Path]:
    """Download every page image of one chapter into *dest_dir*.

    1. Opens the chapter page (sets the DDoS-Guard cookies for the CDN).
    2. Extracts the list of image URLs from readerInit, falling back to
       the DOM.
    3. Intercepts image requests via page.route() + route.fetch()
       (browser stack — correct Sec-Fetch-* headers, cookies).
    4. Flips through the reader with ArrowRight so that all pages load.
    5. Writes the captured bodies as ``NNNN.ext`` files, numbered by the
       position of the URL in the extracted list.

    Args:
        page: Playwright page used for the navigation and interception.
        chapter_url: URL of the chapter; ``mtr=1`` is appended as a query
            parameter before loading.
        dest_dir: Target directory; created when missing.
        manga_url: Referer for the navigation. When omitted,
            ``scheme://host/<first path segment>`` of *chapter_url* is used.

    Returns:
        Paths of the saved images ordered by page index. Empty when the
        chapter could not be opened or no image list was found.
    """
    from urllib.parse import urlparse

    logger.info("Загружаем главу: {}", chapter_url)

    parsed = urlparse(chapter_url)
    parts = parsed.path.strip("/").split("/")
    manga_slug = parts[0] if parts else ""
    referer = manga_url or f"{parsed.scheme}://{parsed.netloc}/{manga_slug}"

    load_url = chapter_url + ("?mtr=1" if "?" not in chapter_url else "&mtr=1")
    dest_dir.mkdir(parents=True, exist_ok=True)

    def _base(u: str) -> str:
        # URL without its query string; used as the identity of an image.
        return u.split("?")[0]

    IMG_RE = re.compile(r"\.(jpg|jpeg|png|webp)(\?|$)", re.I)
    # CDN hosts that serve manga images (not the site's own static files).
    IMG_HOST_RE = re.compile(r"one-way\.work|staticfa\.|rm\.one-way|cdnmanga|reimg", re.I)

    def _is_manga_image(url: str) -> bool:
        if not IMG_RE.search(_base(url)):
            return False
        # Exclude the site's static assets (logos, icons, fonts).
        if "resrmr." in url or "/static/" in url:
            return False
        # Accept only the image CDN hosts.
        return bool(IMG_HOST_RE.search(url))

    captured: dict[str, bytes] = {}  # base_url → bytes
    lock = asyncio.Lock()

    async def route_handler(route, request):
        url = request.url
        base = _base(url)
        if not _is_manga_image(url):
            await route.continue_()
            return
        # Already captured — let the request through untouched.
        async with lock:
            already = base in captured
        if already:
            await route.continue_()
            return
        try:
            response = await route.fetch()
            body = await response.body()
            if body and len(body) > 500 and response.status in (200, 206):
                async with lock:
                    if base not in captured:
                        captured[base] = body
                logger.debug("✓ {}: {} байт", base.split("/")[-1], len(body))
            await route.fulfill(response=response)
        except Exception as e:
            logger.debug("route.fetch {}: {}", base[-40:], e)
            try:
                await route.continue_()
            except Exception:
                # The route may already be handled; nothing more to do.
                pass

    url_to_idx: dict[str, int] = {}
    total = 0

    async def _pages_done() -> int:
        # Count only bodies that belong to this chapter's page list: other
        # CDN images seen by the handler must not end the paging early.
        async with lock:
            return sum(1 for key in captured if key in url_to_idx)

    await page.route("**/*", route_handler)
    try:
        # 1. Open the chapter.
        ok = await _navigate(page, load_url, referer=referer)
        if not ok:
            logger.error("Не удалось открыть главу: {}", chapter_url)
            return []

        # 2. Wait for readerInit.
        try:
            await page.wait_for_function(
                "() => Array.from(document.querySelectorAll('script'))"
                ".some(s => s.textContent.includes('readerInit'))",
                timeout=15_000,
            )
        except Exception:
            logger.debug("readerInit не появился за 15с")

        # 3. Extract the URL list.
        image_urls = await _extract_images_from_js(page)
        if not image_urls:
            image_urls = await _extract_images_from_dom(page)
        if not image_urls:
            logger.error("Список изображений пуст: {}", chapter_url)
            return []

        logger.info("Найдено изображений: {}", len(image_urls))
        url_to_idx.update({_base(u): i for i, u in enumerate(image_urls)})
        total = len(image_urls)
        # Distinct pages to wait for (duplicates in the list share one key).
        expected = len(url_to_idx)

        # 4. Flip through the reader — it loads pages as they are turned.
        await asyncio.sleep(1)
        for i in range(total + 10):
            if await _pages_done() >= expected:
                break
            try:
                await page.keyboard.press("ArrowRight")
                await asyncio.sleep(0.5)
            except Exception:
                break
            if i % 20 == 19:
                logger.debug("Пролистано {}, загружено: {}/{}",
                             i + 1, await _pages_done(), total)

        # Final wait for requests still in flight.
        await asyncio.sleep(3)
    finally:
        # Always remove the handler, including on errors and cancellation,
        # so it does not leak into later use of the page.
        await page.unroute("**/*", route_handler)

    logger.info("Перехвачено: {}/{}", await _pages_done(), total)

    # 5. Save in the right order.
    paths: dict[int, Path] = {}
    for base_url, body in captured.items():
        if base_url not in url_to_idx:
            continue
        idx = url_to_idx[base_url]
        ext = _get_ext(base_url)
        p = dest_dir / f"{idx:04d}{ext}"
        p.write_bytes(body)
        paths[idx] = p

    missing = total - len(paths)
    if missing:
        logger.warning("Не загружено страниц: {}", missing)

    return [paths[i] for i in sorted(paths)]
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user