commit 8ca338ce46c51392f340f14fb3aa859902512146
Author: UnknownMp
Date:   Mon Jun 9 17:36:34 2025 +0800

    First commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c4802a5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/*
+dev
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..1b04f41
--- /dev/null
+++ b/main.py
@@ -0,0 +1,565 @@
+import os
+import time
+import random
+import logging
+import uuid
+import inspect
+import copy
+import re
+from urllib import parse as uparse
+from datetime import datetime
+from typing import Optional, List, Any, Tuple, Dict, Callable
+from contextlib import asynccontextmanager
+from enum import Enum
+
+import ujson
+
+from fastapi import FastAPI, HTTPException, Request, Response, File, Form, Query, UploadFile, Depends, Path
+from fastapi.responses import HTMLResponse, JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, create_model
+from pydantic.dataclasses import dataclass
+from lxml import html
+from httpx import AsyncClient, TimeoutException
+from httpx_retries import RetryTransport, Retry
+from urllib.parse import urlparse
+import httpx
+
+import uvicorn
+
+CONFIG: Dict[str, Any] = {
+    'http_proxy': 'http://10.0.0.3:20171/',
+    'dev': os.path.isfile('./dev')
+}
+
+logging.basicConfig(level=logging.WARNING)
+
+logging.getLogger("cache").setLevel(logging.DEBUG if CONFIG['dev'] else logging.WARNING)
+logging.getLogger("network").setLevel(logging.DEBUG if CONFIG['dev'] else logging.WARNING)
+
+logger = logging.getLogger("main")
+logger.setLevel(logging.DEBUG if CONFIG['dev'] else logging.INFO)
+
+class Exceptions:
+    class WorkNotFound(Exception):
+        def __init__(self, wid: int, cid: Optional[int] = None, *args: Any):
+            super().__init__(*args)
+            self.work_id = wid
+            self.chapter_id = cid
+
+    class NetworkTimeout(Exception):
+        def __init__(self, endpoint: str, *args: Any):
+            super().__init__(*args)
+            self.endpoint = endpoint
+
+    class Unauthorized(Exception):
+        def __init__(self, endpoint: str, *args: Any):
+            super().__init__(*args)
+            self.endpoint = endpoint
+
+class RE:
+    author_pseuds = re.compile(r"^/users/([^/]+)/pseuds/([^/]+)$")
+    chapter_work = re.compile(r"/works/(\d+)/chapters/(\d+)")
+
+@dataclass
+class Pair:
+    left: Any
+    right: Any
+
+class Util:
+    @staticmethod
+    def cleanInt(string: str) -> int:
+        # Parse an int out of strings such as '1,234 words', keeping a leading minus sign.
+        digits_only = ''.join(filter(str.isdigit, string))
+        number = int(digits_only)
+        if string and string[0] == '-':
+            number = -number
+        return number
+
+    @staticmethod
+    def html_to_text_with_newlines(stree: html.HtmlElement) -> str:
+        # Work on a copy so the caller's tree is untouched, then turn <br> and <p>
+        # boundaries into explicit newlines before extracting the text.
+        tree = copy.deepcopy(stree)
+        for br in tree.xpath('//br'):
+            if br.tail:
+                br.tail = '\n' + br.tail
+            else:
+                br.tail = '\n'
+
+        for p in tree.xpath('//p'):
+            if p.tail:
+                p.tail = '\n\n' + p.tail
+            else:
+                p.tail = '\n\n'
+        return tree.text_content()
+
+    @staticmethod
+    def split_lines(text: str) -> list[str]:
+        # Drop empty lines and return a real list, matching the annotation.
+        return [line for line in text.split('\n') if line]
+
+    class AsyncCache:
+        def __init__(self, fallback: Optional[Callable[[str], Any]] = None, expire: Optional[int] = None):
+            self.expire = expire
+            self.cache: dict[str, Pair] = {}
+            self.fallback = fallback
+            self.logger = logging.getLogger("cache")
+
+        async def __call__(self, key: str) -> Any:
+            return await self.get(key)
+
+        async def get(self, key: str) -> Any:
+            if key in self.cache:
+                item = self.cache[key]
+                if self.expire:
+                    # A negative expire disables caching entirely.
+                    if self.expire < 0:
+                        self.logger.debug(f'Miss {key} skip')
+                        return await self.resolve(key)
+                    if time.time() - item.right > self.expire:
+                        self.logger.debug(f'Miss {key} expired')
+                        return await self.resolve(key)
+                    else:
+                        self.logger.debug(f'Hit {key}')
+                        return item.left
+                else:
+                    self.logger.debug(f'Hit {key} skip')
+                    return item.left
+            else:
+                self.logger.debug(f'Miss {key} notfound')
+                return await self.resolve(key)
+
+        def set(self, key: str, value: Any):
+            self.logger.debug(f'Set {key}')
+            self.cache[key] = Pair(value, time.time())
+
+        async def resolve(self, key: str) -> Any:
+            self.logger.debug(f'Resolve {key}')
+            if self.fallback:
+                st = time.time()
+                value = self.fallback(key)
+                if inspect.isawaitable(value):
+                    self.logger.debug(f'Resolve {key} await')
+                    value = await value
+                et = time.time()
+                self.set(key, value)
+                self.logger.debug(f'Resolve {key} done in {et - st:.4f}s')
+                return value
+            else:
+                self.logger.warning('No fallback function')
+                raise KeyError(key)
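+# A minimal usage sketch for AsyncCache (illustrative only; `fetch_page` is a
+# hypothetical async callable, not part of this module):
+#
+#     cache = Util.AsyncCache(fetch_page, expire=1800)
+#     body = await cache('https://example.org/a')   # miss -> fetch_page() runs
+#     body = await cache('https://example.org/a')   # hit until 1800 s elapse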
+class Category(Enum):
+    FF = "ff"  # GL
+    FM = "fm"  # BG
+    MM = "mm"  # BL
+    NONE = "none"
+    MULTI = "multi"
+    UNKNOWN = "unknown"
+
+    @classmethod
+    def parse(cls, typ: str) -> "Category":
+        match typ.strip():
+            case 'F/F': return cls.FF
+            case 'F/M': return cls.FM
+            case 'M/M': return cls.MM
+            case 'Gen': return cls.NONE
+            case 'Multi': return cls.MULTI
+            case _: return cls.UNKNOWN
+
+@dataclass
+class AO3Time:
+    year: int
+    month: int
+    date: int
+
+    _mon_name = {
+        "jan": 1,
+        "feb": 2,
+        "mar": 3,
+        "apr": 4,
+        "may": 5,
+        "jun": 6,
+        "jul": 7,
+        "aug": 8,
+        "sep": 9,   # three-letter form used in AO3 listings
+        "sept": 9,  # four-letter form kept as a fallback
+        "oct": 10,
+        "nov": 11,
+        "dec": 12,
+    }
+
+    @classmethod
+    def parse(cls, string: str) -> "AO3Time":
+        # 'YYYY-MM-DD' form, as shown on work pages.
+        year, month, date = string.strip().split('-', 2)
+        return cls(
+            year=int(year),
+            month=int(month),
+            date=int(date),
+        )
+
+    @classmethod
+    def parse1(cls, string: str) -> "AO3Time":
+        # 'DD Mon YYYY' form, as shown in search listings.
+        date, month, year = string.strip().split(' ', 2)
+        return cls(
+            year=int(year),
+            month=cls._mon_name.get(month.lower(), -1),
+            date=int(date),
+        )
+
+    def __repr__(self) -> str:
+        return f'{self.year}-{self.month}-{self.date}'
+
+@dataclass
+class AO3WorkStat:
+    publishedTime: AO3Time
+    wordCount: int
+    hitCount: int
+    kudoCount: Optional[int] = None
+    commentCount: Optional[int] = None
+    bookmarkCount: Optional[int] = None
+    chapter: Optional[Pair] = None
+    updatedTime: Optional[AO3Time] = None
+    completedTime: Optional[AO3Time] = None
+
+@dataclass
+class WorkDataResult:
+    @dataclass
+    class ChapterItem:
+        title: str
+        chapterId: int
+
+    workId: int
+    chapterId: Optional[int]
+    title: str
+    text: list[str]
+    pseud: str
+    lang: str
+    stats: AO3WorkStat
+    summary: Optional[str] = None
+    fandom: Optional[list[str]] = None
+    category: Optional[list[Category]] = None
+    relationship: Optional[list[str]] = None
+    additionalTags: Optional[list[str]] = None
+    code: int = 0
+    chapters: Optional[list[ChapterItem]] = None
+    chapterIndex: Optional[int] = None
+
+@dataclass
+class SearchWorkItem:
+    workId: int
+    title: str
+    pseud: str
+    author: str
+    summary: str
+    stats: AO3WorkStat
+    giftTo: Optional[str] = None
+
+@dataclass
+class SimpleSearchWorkResult:
+    keyword: str
+    count: int
+    pageCount: int
+    page: int
+    works: list[SearchWorkItem]
+    code: int = 0
+
+class Network:
+    def __init__(self, proxy: Optional[str] = None) -> None:
+        self.logger = logging.getLogger("network")
+        self.http_proxy: Optional[str] = proxy
+        self.client = AsyncClient(
+            proxy=self.http_proxy,
+            transport=RetryTransport(retry=Retry(
+                total=5,
+                backoff_factor=0.5
+            ))
+        )
+        self.xmpp_client = httpx.AsyncClient(timeout=10)
+        # Raw-response cache: effectively permanent in dev, 30 minutes otherwise.
+        self._get = Util.AsyncCache(self._real_get, 36000000 if CONFIG['dev'] else 1800)
+        # Parsed-work cache, keyed by (work_id, chapter_id); never expires.
+        self.works: dict[tuple[int, Optional[int]], WorkDataResult] = {}
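+    # The two cache layers above compose like this (illustrative sketch; the
+    # work ID is made up and this assumes a running event loop):
+    #
+    #     net = Network(CONFIG['http_proxy'])
+    #     r = await net._get('https://archiveofourown.org/works/1?view_adult=yes')
+    #     # -> raw httpx.Response, cached by URL for `expire` seconds
+    #     w = await net.get_page_data(1)
+    #     # -> parsed WorkDataResult, cached indefinitely under (1, None)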
+    async def send_message(self, msg: str):
+        # Push a notification through a local XMPP bridge; failures are only logged.
+        url = "http://10.0.0.3:52222/send"
+        headers = {"Content-Type": "application/json"}
+        payload = {
+            "alias": "ao3mirror",
+            "message": msg
+        }
+        try:
+            response = await self.xmpp_client.post(url, headers=headers, json=payload)
+            response.raise_for_status()
+        except httpx.HTTPError as e:
+            self.logger.error(e)
+
+    async def _real_get(self, uri: str) -> httpx.Response:
+        url = urlparse(uri)
+        try:
+            self.logger.debug(f'HTTP Get {uri}')
+            response = await self.client.get(uri, headers=self._build_headers(url.scheme, url.netloc))
+        except httpx.TimeoutException as e:
+            raise Exceptions.NetworkTimeout(uri) from e
+        response.raise_for_status()
+        return response
+
+    def _build_headers(self, scheme: str, host: str) -> Dict[str, str]:
+        return {
+            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:137.0) Gecko/20100101 Firefox/137.0",
+            "Referer": f"{scheme}://{host}/",
+            "Origin": f"{scheme}://{host}"
+        }
+
+    async def _get_page_data(self, work_id: int, chapter_id: Optional[int] = None) -> WorkDataResult:
+        url = (
+            f'https://archiveofourown.org/works/{work_id}/chapters/{chapter_id}?view_adult=yes' if chapter_id
+            else f'https://archiveofourown.org/works/{work_id}?view_adult=yes'
+        )
+        try:
+            response = await self._get(url)
+        except httpx.HTTPStatusError as e:
+            if e.response.status_code == 404:
+                raise Exceptions.WorkNotFound(work_id, chapter_id)
+            elif e.response.status_code == 302:
+                # Multi-chapter works redirect /works/<id> to /works/<id>/chapters/<first chapter>.
+                newurl = e.response.headers.get('location')
+                # search() rather than match(): the Location header may be an
+                # absolute URL, in which case the path is not at position 0.
+                if (result := RE.chapter_work.search(newurl)) and (ids := result.groups()):
+                    work_id, chapter_id = ids
+                    self.logger.info(f'Work {work_id} has first chapter {chapter_id}')
+                    try:
+                        work_id = Util.cleanInt(work_id)
+                        chapter_id = Util.cleanInt(chapter_id)
+                    except ValueError as e1:
+                        raise e1 from e
+                    return await self.get_page_data(work_id, chapter_id)
+                elif newurl.endswith('/users/login?restricted=true'):
+                    # Restricted works bounce to the login page.
+                    raise Exceptions.Unauthorized(url)
+                else:
+                    self.logger.debug(f'Unknown 302 to: {e.response.headers.get("location")}')
+                    raise
+            else:
+                self.logger.debug(f'Unknown code: {e.response.status_code}')
+                raise
+        return self.parse_page_data(response.text, work_id, chapter_id)
+
+    async def get_page_data(self, work_id: int, chapter_id: Optional[int] = None) -> WorkDataResult:
+        wid = (work_id, chapter_id)
+        if wid in self.works:
+            self.logger.debug(f"Work {work_id}:{chapter_id} hit cache")
+            return self.works[wid]
+        else:
+            self.logger.debug(f"Work {work_id}:{chapter_id} miss cache")
+            result = await self._get_page_data(work_id, chapter_id)
+            self.works[wid] = result
+            return result
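+    # The redirect handling above, sketched end to end (hypothetical IDs):
+    #
+    #     GET /works/100?view_adult=yes
+    #       -> 302, Location: .../works/100/chapters/200
+    #       -> retried as get_page_data(100, 200), which fetches and parses
+    #          the first chapter page.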
+    def parse_page_data(self, result: str, work_id: int, chapter_id: Optional[int]) -> WorkDataResult:
+        tree = html.fromstring(result)
+        meta_block = tree.cssselect('div.wrapper')[0].cssselect('dl.work')[0]
+        stats_block = meta_block.cssselect('dl.stats')[0]
+
+        # The status row is either 'Completed:' or 'Updated:', depending on the work.
+        if (tblock := stats_block.cssselect('dt.status')) and (dblock := stats_block.cssselect('dd.status')):
+            tblock = tblock[0]
+            dblock = dblock[0]
+            match tblock.text_content().strip().lower():
+                case 'completed:':
+                    completedTime = AO3Time.parse(dblock.text_content())
+                    updatedTime = None
+                case 'updated:':
+                    completedTime = None
+                    updatedTime = AO3Time.parse(dblock.text_content())
+                case _:
+                    completedTime = None
+                    updatedTime = None
+        else:
+            completedTime = None
+            updatedTime = None
+
+        if block := meta_block.cssselect('dd.category'):
+            categories = [Category.parse(item.text_content().strip()) for item in block[0].cssselect('li a')]
+        else:
+            categories = []
+
+        if block := meta_block.cssselect('dd.fandom'):
+            fandoms = [item.text_content().strip() for item in block[0].cssselect('li a')]
+        else:
+            fandoms = []
+
+        if block := meta_block.cssselect('dd.relationship'):
+            relationships = [item.text_content().strip() for item in block[0].cssselect('li a')]
+        else:
+            relationships = []
+
+        if block := meta_block.cssselect('dd.freeform.tags'):
+            additionalTags = [item.text_content().strip() for item in block[0].cssselect('li a')]
+        else:
+            additionalTags = []
+
+        body_block = tree.xpath('//*[@id="workskin"]')[0]
+        preface_block = body_block.cssselect('.preface')[0]
+
+        # 'Chapters: m/n', where n is '?' (stored as -1) for works of unknown length.
+        if chapter_block := meta_block.cssselect('dd.chapters'):
+            chapter_block = chapter_block[0]
+            left, right = chapter_block.text_content().split('/')
+            right = right.strip()
+            if right == '?':
+                right = -1
+            else:
+                right = Util.cleanInt(right)
+            chapter = Pair(Util.cleanInt(left), right)
+        else:
+            chapter = None
+
+        text = []
+        for p in body_block.cssselect('div.userstuff p'):
+            text.extend(Util.split_lines(Util.html_to_text_with_newlines(p)))
+
+        if chapter_id:
+            chapters = []
+            chapterIndex = 0
+            index = 0
+            # Prefer the chapter-index dropdown; fall back to the chapter heading.
+            if chapter_block := tree.cssselect('div.work ul.work.navigation.actions li.chapter ul#chapter_index.expandable.secondary li form'):
+                for chapter_option in chapter_block[0].cssselect('select#selected_id')[0].cssselect('option'):
+                    # Options read 'N. Title'; split once so titles containing '.' survive.
+                    title = chapter_option.text_content().split('.', 1)[1].strip()
+                    cchapter_id = int(chapter_option.attrib['value'])
+                    chapters.append(WorkDataResult.ChapterItem(title, cchapter_id))
+                    if chapter_id == cchapter_id:
+                        chapterIndex = index
+                    index += 1
+            else:
+                chapters.append(WorkDataResult.ChapterItem(
+                    tree.cssselect('div#workskin div#chapters h3.title')[0].text_content().split(':', 1)[1].strip(),
+                    chapter_id
+                ))
+        else:
+            chapters = chapterIndex = None
+
+        summary = (
+            Util.html_to_text_with_newlines(summary_block[0]).strip()
+            if (summary_block := tree.cssselect('blockquote.userstuff')) else None
+        )
+
+        return WorkDataResult(
+            workId=work_id, chapterId=chapter_id, chapterIndex=chapterIndex,
+            title=preface_block.cssselect("h2.title")[0].text_content().strip(),
+            summary=summary, text=text,
+            stats=AO3WorkStat(
+                publishedTime=AO3Time.parse(stats_block.cssselect('dd.published')[0].text_content()),
+                completedTime=completedTime, updatedTime=updatedTime,
+                wordCount=Util.cleanInt(stats_block.cssselect('dd.words')[0].text_content()),
+                #kudoCount=Util.cleanInt(stats_block.cssselect('dd.kudos')[0].text_content()),
+                hitCount=Util.cleanInt(stats_block.cssselect('dd.hits')[0].text_content()),
+                chapter=chapter
+            ),
+            category=categories, fandom=fandoms,
+            relationship=relationships, additionalTags=additionalTags,
+            lang=meta_block.cssselect('dd.language')[0].text_content().strip(),
+            pseud=preface_block.cssselect('.byline')[0].text_content().strip(),
+            chapters=chapters
+        )
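+    # Examples of the chapter stat parse above (illustrative): 'Chapters: 3/?'
+    # yields Pair(left=3, right=-1); 'Chapters: 5/5' yields Pair(left=5, right=5).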
+    async def search_works(self, keyword: str, page: int = 1) -> SimpleSearchWorkResult:
+        page = abs(page)
+        url = f"https://archiveofourown.org/works/search?work_search%5Bquery%5D={uparse.quote(keyword)}" if page in (0, 1) else \
+            f"https://archiveofourown.org/works/search?work_search%5Bquery%5D={uparse.quote(keyword)}&page={page}"
+        response = await self._get(url)
+        return self.parse_search_result(response.text, keyword, page)
+
+    def parse_search_result(self, result: str, keyword: str, page: int = 1) -> SimpleSearchWorkResult:
+        tree = html.fromstring(result)
+        count_block = tree.cssselect('h3.heading')[0].text_content()
+        if 'Found' in count_block:
+            count = Util.cleanInt(count_block.split('Found', 1)[0])
+            work_blocks = tree.cssselect('ol.work.index.group > li')
+            works = []
+            for block in work_blocks:
+                # Heading anchors: title, then author, then an optional gift recipient.
+                anchors = block.cssselect('div.header.module > h4.heading > a')
+                match len(anchors):
+                    case 1:
+                        b1 = anchors[0]
+                        b2 = None
+                        b3 = None
+                    case 2:
+                        b1, b2 = anchors
+                        b3 = None
+                    case 3:
+                        b1, b2, b3 = anchors
+                        b3 = b3.attrib['href'].split('/')[-2]
+                    case _:
+                        continue
+                if b1 is not None:
+                    work_id = int(b1.attrib['href'].split('/')[-1])
+                else:
+                    continue
+                if b2 is not None and (author_match := RE.author_pseuds.match(b2.attrib['href'])):
+                    username, pseud = author_match.groups()
+                else:
+                    username = pseud = ''
+                datetime_block = block.cssselect('div.header.module > p.datetime')[0]
+                summary = Util.html_to_text_with_newlines(summary_block[0]) if (summary_block := block.cssselect('blockquote.userstuff')) else ''
+                kudoCount = Util.cleanInt(kudo_block[0].text_content()) if (kudo_block := block.cssselect('dd.kudos')) else None
+                stats_block = block.cssselect('dl.stats')[0]
+                works.append(SearchWorkItem(
+                    workId=work_id,
+                    title=b1.text_content().strip(),
+                    author=username, pseud=pseud, giftTo=b3,
+                    summary=summary.strip(),
+                    stats=AO3WorkStat(
+                        publishedTime=AO3Time.parse1(datetime_block.text_content()),
+                        wordCount=Util.cleanInt(stats_block.cssselect('dd.words')[0].text_content()),
+                        kudoCount=kudoCount,
+                        hitCount=Util.cleanInt(stats_block.cssselect('dd.hits')[0].text_content()),
+                    ),
+                ))
+            # The page count is the label of the link just before 'Next' in the pagination bar.
+            page_count = Util.cleanInt(next_block[0].getprevious().text_content()) \
+                if (page_block := tree.cssselect('ol.pagination.actions')) and (next_block := page_block[0].cssselect('li.next')) else 1
+            return SimpleSearchWorkResult(
+                keyword=keyword, count=count, pageCount=page_count, page=page, works=works
+            )
+        else:
+            return SimpleSearchWorkResult(
+                code=1, keyword=keyword, count=-1, pageCount=-1, page=page, works=[]
+            )
+
+network: Optional[Network] = None
+
+async def startup() -> None:
+    global network
+    logger.info("Create httpx async client.")
+    network = Network(CONFIG['http_proxy'])
+
+async def shutdown() -> None:
+    logger.info("Shutdown httpx async client.")
+    if network is not None:
+        await network.client.aclose()
+        await network.xmpp_client.aclose()
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    await startup()
+    yield
+    await shutdown()
+
+def require_network() -> Network:
+    if network is None:
+        raise HTTPException(status_code=503, detail="Network client not initialized.")
+    return network
+
+app = FastAPI(
+    lifespan=lifespan,
+)
+
+@app.exception_handler(Exceptions.NetworkTimeout)
+async def network_timeout_handler(request: Request, exc: Exceptions.NetworkTimeout):
+    return JSONResponse(status_code=504, content={"code": 1, "endpoint": exc.endpoint})
+
+@app.exception_handler(httpx.HTTPStatusError)
+async def http_status_error_handler(request: Request, exc: httpx.HTTPStatusError):
+    return JSONResponse(status_code=502, content={"code": 2, "endpoint": str(exc.request.url), "status": exc.response.status_code})
+
+@app.exception_handler(Exceptions.Unauthorized)
+async def unauthorized_handler(request: Request, exc: Exceptions.Unauthorized):
+    return JSONResponse(status_code=401, content={"code": 1, "endpoint": exc.endpoint})
+
+@app.get("/search/simple")
+async def search_work_simple(
+    network: Network = Depends(require_network),
+    keyword: str = Query(...),
+    page: int = Query(1)
+) -> SimpleSearchWorkResult:
+    result = await network.search_works(keyword, page)
+    await network.send_message(f'[Info] Simple search\nKeyword: {keyword}\nTotal: {result.count}\nPage: {result.page}/{result.pageCount}')
+    logger.info(f'Simple Search {keyword} count {result.count} page {result.page}/{result.pageCount}')
+    return result
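+# Example request against the route above (hypothetical values):
+#
+#     GET /search/simple?keyword=coffee%20shop&page=2
+#     -> {"keyword": "coffee shop", "count": 1234, "pageCount": 25, "page": 2,
+#         "works": [...], "code": 0}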
+@app.get("/work/{work_id}")
+async def get_work(
+    network: Network = Depends(require_network),
+    work_id: int = Path(..., description="Work ID"),
+) -> WorkDataResult:
+    try:
+        result = await network.get_page_data(work_id)
+        await network.send_message(f'[Info] Work\nID: {result.workId}\nAuthor: {result.pseud}\nTitle: {result.title}\nPublished: {result.stats.publishedTime}')
+        logger.info(f'Work {work_id} title: {result.title}')
+    except Exceptions.WorkNotFound as e:
+        logger.warning(f"Work not found: {e.work_id}")
+        return JSONResponse(status_code=404, content={"code": 1, "work_id": e.work_id})
+    else:
+        return result
+
+@app.get("/work/{work_id}/{chapter_id}")
+async def get_work_chapter(
+    network: Network = Depends(require_network),
+    work_id: int = Path(..., description="Work ID"),
+    chapter_id: int = Path(..., description="Chapter ID"),
+) -> WorkDataResult:
+    try:
+        result = await network.get_page_data(work_id, chapter_id)
+        await network.send_message(f'[Info] Work\nID: {result.workId}:{result.chapterId}\nAuthor: {result.pseud}\nTitle: {result.title}\nPublished: {result.stats.publishedTime}')
+        logger.info(f'Work {work_id}:{chapter_id} title: {result.title}')
+    except Exceptions.WorkNotFound as e:
+        logger.warning(f"Work not found: {e.work_id} {e.chapter_id}")
+        return JSONResponse(status_code=404, content={"code": 1, "work_id": e.work_id, "chapter_id": chapter_id})
+    else:
+        return result
+
+if __name__ == "__main__":
+    uvicorn.run("main:app", host="0.0.0.0", port=28001, log_level="debug" if CONFIG['dev'] else "info", reload=CONFIG['dev'])
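+# Manual smoke test with the defaults above (IDs are placeholders):
+#
+#     curl 'http://127.0.0.1:28001/search/simple?keyword=test'
+#     curl 'http://127.0.0.1:28001/work/12345'
+#     curl 'http://127.0.0.1:28001/work/12345/67890'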