First commit

2025-06-09 17:36:34 +08:00
commit 8ca338ce46
2 changed files with 567 additions and 0 deletions

.gitignore vendored Normal file (+2 lines)

@@ -0,0 +1,2 @@
__pycache__/*
dev

main.py Normal file (+565 lines)

@@ -0,0 +1,565 @@
import os
import time
import random
import logging
import uuid
import inspect
import copy
import re
from urllib import parse as uparse
from datetime import datetime
from typing import Optional, List, Any, Tuple, Dict, Callable
from contextlib import asynccontextmanager
from enum import Enum
import ujson
from fastapi import FastAPI, HTTPException, Request, Response, File, Form, Query, UploadFile, Depends, Path
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, create_model
from pydantic.dataclasses import dataclass
from lxml import html
from httpx import AsyncClient, TimeoutException
from httpx_retries import RetryTransport, Retry
from urllib.parse import urlparse
import httpx
import uvicorn

# CONFIG holds mixed value types ('dev' is a bool), so annotate Dict[str, Any].
CONFIG: Dict[str, Any] = {
    'http_proxy': 'http://10.0.0.3:20171/',
    'dev': os.path.isfile('./dev')
}
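
# Dev mode is toggled by the presence of a local `dev` marker file (ignored
# via .gitignore); it enables debug logging, a long-lived HTTP cache and
# uvicorn autoreload.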
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("cache")
logger.setLevel(logging.DEBUG if CONFIG['dev'] else logging.WARNING)
logger = logging.getLogger("network")
logger.setLevel(logging.DEBUG if CONFIG['dev'] else logging.WARNING)
logger = logging.getLogger("main")
logger.setLevel(logging.DEBUG if CONFIG['dev'] else logging.INFO)

class Exceptions:
    class WorkNotFound(Exception):
        def __init__(self, wid: int, cid: Optional[int] = None, *args: Any):
            super().__init__(*args)
            self.work_id = wid
            self.chapter_id = cid

    class NetworkTimeout(Exception):
        def __init__(self, endpoint, *args: Any):
            super().__init__(*args)
            self.endpoint = endpoint

    class Unauthorized(Exception):
        def __init__(self, endpoint, *args: Any):
            super().__init__(*args)
            self.endpoint = endpoint
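
# Pre-compiled regular expressions for extracting user/pseud and
# work/chapter IDs from AO3 URL paths.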
class RE:
    author_pseuds = re.compile(r"^/users/([^/]+)/pseuds/([^/]+)$")
    chapter_work = re.compile(r"/works/(\d+)/chapters/(\d+)")

@dataclass
class Pair:
    left: Any
    right: Any

class Util:
    @staticmethod
    def cleanInt(string: str) -> int:
        digits_only = ''.join(filter(str.isdigit, string))
        number = int(digits_only)
        if string and string[0] == '-':
            number = -number
        return number

    @staticmethod
    def html_to_text_with_newlines(stree: html.HtmlElement) -> str:
        tree = copy.deepcopy(stree)
        for br in tree.xpath('//br'):
            if br.tail:
                br.tail = '\n' + br.tail
            else:
                br.tail = '\n'
        for p in tree.xpath('//p'):
            if p.tail:
                p.tail = '\n\n' + p.tail
            else:
                p.tail = '\n\n'
        return tree.text_content()

    @staticmethod
    def split_lines(text: str) -> list[str]:
        # filter() yields an iterator; materialize it to match the annotation.
        return [line for line in text.split('\n') if line]
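
# A minimal in-memory TTL cache for coroutine results, keyed by string (here:
# request URLs). expire=None caches forever; a negative expire bypasses the
# cache entirely.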
class AsyncCache:
    def __init__(self, fallback: Optional[Callable[[str], Any]] = None, expire: Optional[int] = None):
        self.expire = expire
        self.cache: dict[str, Pair] = {}
        self.fallback = fallback
        self.logger = logging.getLogger("cache")

    async def __call__(self, key: str) -> Any:
        return await self.get(key)

    async def get(self, key: str) -> Any:
        if key in self.cache:
            item = self.cache[key]
            if self.expire:
                if self.expire < 0:
                    self.logger.debug(f'Miss {key} skip')
                    return await self.resolve(key)
                if time.time() - item.right > self.expire:
                    self.logger.debug(f'Miss {key} expired')
                    return await self.resolve(key)
                else:
                    self.logger.debug(f'Hit {key}')
                    return item.left
            else:
                self.logger.debug(f'Hit {key} skip')
                return item.left
        else:
            self.logger.debug(f'Miss {key} notfound')
            return await self.resolve(key)

    def set(self, key: str, value: Any):
        self.logger.debug(f'Set {key}')
        self.cache[key] = Pair(value, time.time())

    async def resolve(self, key: str) -> Any:
        self.logger.debug(f'Resolve {key}')
        if self.fallback:
            st = time.time()
            value = self.fallback(key)
            if inspect.isawaitable(value):
                self.logger.debug(f'Resolve {key} await')
                value = await value
            et = time.time()
            self.set(key, value)
            self.logger.debug(f'Resolve {key} done in {et - st:.4f}')
            return value
        else:
            self.logger.warning('No fallback function')
            raise KeyError(key)
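
# Usage sketch (hypothetical names, assuming some async fetcher coroutine):
#
#     fetch = AsyncCache(fallback=some_async_get, expire=1800)
#     resp = await fetch("https://example.org/")  # resolved via fallback
#     resp = await fetch("https://example.org/")  # served from cache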

class Category(Enum):
    FF = "ff"  # GL
    FM = "fm"  # BG
    MM = "mm"  # BL
    NONE = "none"
    MULTI = "multi"
    UNKNOWN = "unknown"

    @classmethod
    def parse(cls, typ: str) -> "Category":
        match typ.strip():
            case 'F/F': return cls.FF
            case 'F/M': return cls.FM
            case 'M/M': return cls.MM
            case 'Gen': return cls.NONE
            case 'Multi': return cls.MULTI
            case _: return cls.UNKNOWN

@dataclass
class AO3Time:
    year: int
    month: int
    date: int

    # Month-name lookup; accept both "sep" and "sept" since the exact
    # abbreviation AO3 renders is not guaranteed here.
    _mon_name = {
        "jan": 1,
        "feb": 2,
        "mar": 3,
        "apr": 4,
        "may": 5,
        "jun": 6,
        "jul": 7,
        "aug": 8,
        "sep": 9,
        "sept": 9,
        "oct": 10,
        "nov": 11,
        "dec": 12,
    }
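
    # parse() reads the ISO-style "YYYY-MM-DD" dates shown on work pages;
    # parse1() reads the "DD Mon YYYY" form used on search result cards.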
    @classmethod
    def parse(cls, string: str) -> "AO3Time":
        year, month, date = string.strip().split('-', 2)
        return cls(
            year=int(year),
            month=int(month),
            date=int(date),
        )

    @classmethod
    def parse1(cls, string: str) -> "AO3Time":
        date, month, year = string.strip().split(' ', 2)
        return cls(
            year=int(year),
            month=cls._mon_name.get(month.lower(), -1),
            date=int(date),
        )

    def __repr__(self) -> str:
        return f'{self.year}-{self.month}-{self.date}'

@dataclass
class AO3WorkStat:
    publishedTime: AO3Time
    wordCount: int
    hitCount: int
    kudoCount: Optional[int] = None
    commentCount: Optional[int] = None
    bookmarkCount: Optional[int] = None
    chapter: Optional[Pair] = None
    updatedTime: Optional[AO3Time] = None
    completedTime: Optional[AO3Time] = None

@dataclass
class WorkDataResult:
    @dataclass
    class ChapterItem:
        title: str
        chapterId: int

    workId: int
    chapterId: Optional[int]
    title: str
    text: list[str]
    pseud: str
    lang: str
    stats: AO3WorkStat
    summary: Optional[str] = None
    fandom: Optional[list[str]] = None
    category: Optional[list[Category]] = None
    relationship: Optional[list[str]] = None
    additionalTags: Optional[list[str]] = None
    code: int = 0
    chapters: Optional[list[ChapterItem]] = None
    chapterIndex: Optional[int] = None

@dataclass
class SearchWorkItem:
    workId: int
    title: str
    pseud: str
    author: str
    summary: str
    stats: AO3WorkStat
    giftTo: Optional[str] = None

@dataclass
class SimpleSearchWorkResult:
    keyword: str
    count: int
    pageCount: int
    page: int
    works: list[SearchWorkItem]
    code: int = 0
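
# Network owns two httpx clients: a retrying, optionally proxied client for
# AO3 itself, and a plain client for pushing notifications to an XMPP bridge.
# Raw GET responses are memoized through AsyncCache; parsed works are memoized
# separately in self.works.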
class Network:
    def __init__(self, proxy: Optional[str] = None) -> None:
        self.logger = logging.getLogger("network")
        self.http_proxy: Optional[str] = proxy
        self.client = AsyncClient(
            proxy=self.http_proxy,
            transport=RetryTransport(retry=Retry(
                total=5,
                backoff_factor=0.5
            ))
        )
        self.xmpp_client = httpx.AsyncClient(timeout=10)
        # AsyncCache is a module-level class, not a member of Util.
        self._get = AsyncCache(self._real_get, 36000000 if CONFIG['dev'] else 1800)
        self.works = {}

    async def send_message(self, msg: str):
        url = "http://10.0.0.3:52222/send"
        headers = {"Content-Type": "application/json"}
        payload = {
            "alias": "ao3mirror",
            "message": msg
        }
        try:
            response = await self.xmpp_client.post(url, headers=headers, json=payload)
            response.raise_for_status()
        except httpx.HTTPError as e:
            self.logger.error(e)

    async def _real_get(self, uri: str) -> httpx.Response:
        url = urlparse(uri)
        try:
            self.logger.debug(f'HTTP Get {uri}')
            response = await self.client.get(uri, headers=self._build_headers(url.scheme, url.netloc))
        except httpx.TimeoutException as e:
            raise Exceptions.NetworkTimeout(uri) from e
        response.raise_for_status()
        return response

    def _build_headers(self, scheme: str, host: str) -> Dict[str, str]:
        return {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:137.0) Gecko/20100101 Firefox/137.0",
            "Referer": f"{scheme}://{host}/",
            "Origin": f"{scheme}://{host}"
        }
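
    # AO3 answers a bare work URL for a multi-chapter work with a 302 to its
    # first chapter, and answers restricted works with a redirect to the login
    # page. Redirects are not followed automatically, so both cases surface
    # here as HTTPStatusError and are dispatched on the Location header.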
    async def _get_page_data(self, work_id: int, chapter_id: Optional[int] = None) -> WorkDataResult:
        url = (
            f'https://archiveofourown.org/works/{work_id}/chapters/{chapter_id}?view_adult=yes' if chapter_id
            else f'https://archiveofourown.org/works/{work_id}?view_adult=yes'
        )
        try:
            response = await self._get(url)
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                raise Exceptions.WorkNotFound(work_id, chapter_id) from e
            elif e.response.status_code == 302:
                newurl = e.response.headers.get('location')
                if (result := RE.chapter_work.match(newurl)) and (ids := result.groups()):
                    work_id, chapter_id = ids
                    self.logger.info(f'Work {work_id} has first chapter {chapter_id}')
                    try:
                        work_id = Util.cleanInt(work_id)
                        chapter_id = Util.cleanInt(chapter_id)
                    except ValueError as e1:
                        raise e1 from e
                    return await self.get_page_data(work_id, chapter_id)
                elif newurl.endswith('/users/login?restricted=true'):
                    raise Exceptions.Unauthorized(url) from e
                else:
                    self.logger.debug(f'Unknown 302 to: {newurl}')
                    raise
            else:
                self.logger.debug(f'Unknown code: {e.response.status_code}')
                raise
        return self.parse_page_data(response.text, work_id, chapter_id)

    async def get_page_data(self, work_id: int, chapter_id: Optional[int] = None) -> WorkDataResult:
        wid = (work_id, chapter_id)
        if wid in self.works:
            self.logger.debug(f"Work {work_id}:{chapter_id} hit cache")
            return self.works[wid]
        else:
            self.logger.debug(f"Work {work_id}:{chapter_id} miss cache")
            result = await self._get_page_data(work_id, chapter_id)
            self.works[wid] = result
            return result
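
    # Scrape one work (or chapter) page: the dl.work meta block yields status,
    # tags, category, fandom and stats; #workskin yields title, byline, summary
    # and the chapter text; the chapter <select> yields the chapter list.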
    def parse_page_data(self, result: str, work_id: int, chapter_id: Optional[int]) -> WorkDataResult:
        tree = html.fromstring(result)
        meta_block = tree.cssselect('div.wrapper')[0].cssselect('dl.work')[0]
        stats_block = meta_block.cssselect('dl.stats')[0]
        if (tblock := stats_block.cssselect('dt.status')) and (dblock := stats_block.cssselect('dd.status')):
            tblock = tblock[0]
            dblock = dblock[0]
            match tblock.text_content().strip().lower():
                case 'completed:':
                    completedTime = AO3Time.parse(dblock.text_content())
                    updatedTime = None
                case 'updated:':
                    completedTime = None
                    updatedTime = AO3Time.parse(dblock.text_content())
                case _:
                    completedTime = None
                    updatedTime = None
        else:
            completedTime = None
            updatedTime = None
        if block := meta_block.cssselect('dd.category'):
            categories = [Category.parse(item.text_content().strip()) for item in block[0].cssselect('li a')]
        else:
            categories = []
        if block := meta_block.cssselect('dd.fandom'):
            fandoms = [item.text_content().strip() for item in block[0].cssselect('li a')]
        else:
            fandoms = []
        if block := meta_block.cssselect('dd.relationship'):
            relationships = [item.text_content().strip() for item in block[0].cssselect('li a')]
        else:
            relationships = []
        if block := meta_block.cssselect('dd.freeform.tags'):
            additionalTags = [item.text_content().strip() for item in block[0].cssselect('li a')]
        else:
            additionalTags = []
        body_block = tree.xpath('//*[@id="workskin"]')[0]
        preface_block = body_block.cssselect('.preface')[0]
        if chapter_block := meta_block.cssselect('dd.chapters'):
            chapter_block = chapter_block[0]
            left, right = chapter_block.text_content().split('/')
            right = right.strip()
            if right == '?':
                right = -1
            else:
                right = Util.cleanInt(right)
            chapter = Pair(Util.cleanInt(left), right)
        else:
            chapter = None
        text = []
        for p in body_block.cssselect('div.userstuff p'):
            text.extend(Util.split_lines(Util.html_to_text_with_newlines(p)))
        if chapter_id:
            chapters = []
            chapterIndex = 0
            index = 0
            if chapter_block := tree.cssselect('div.work ul.work.navigation.actions li.chapter ul#chapter_index.expandable.secondary li form'):
                for chapter_option in chapter_block[0].cssselect('select#selected_id')[0].cssselect('option'):
                    # Split only on the first '.' so chapter titles containing
                    # dots survive intact.
                    title = chapter_option.text_content().split('.', 1)[1].strip()
                    cchapter_id = int(chapter_option.attrib['value'])
                    chapters.append(WorkDataResult.ChapterItem(title, cchapter_id))
                    if chapter_id == cchapter_id:
                        chapterIndex = index
                    index += 1
            else:
                chapters.append(WorkDataResult.ChapterItem(
                    tree.cssselect('div#workskin div#chapters h3.title')[0].text_content().split(':', 1)[1].strip(),
                    chapter_id
                ))
        else:
            chapters = chapterIndex = None
        summary = (
            Util.html_to_text_with_newlines(summary_block[0]).strip()
            if (summary_block := tree.cssselect('blockquote.userstuff')) else None
        )
        return WorkDataResult(
            workId=work_id, chapterId=chapter_id, chapterIndex=chapterIndex,
            title=preface_block.cssselect("h2.title")[0].text_content().strip(),
            summary=summary, text=text,
            stats=AO3WorkStat(
                publishedTime=AO3Time.parse(stats_block.cssselect('dd.published')[0].text_content()),
                completedTime=completedTime, updatedTime=updatedTime,
                wordCount=Util.cleanInt(stats_block.cssselect('dd.words')[0].text_content()),
                # kudoCount=Util.cleanInt(stats_block.cssselect('dd.kudos')[0].text_content()),
                hitCount=Util.cleanInt(stats_block.cssselect('dd.hits')[0].text_content()),
                chapter=chapter
            ),
            category=categories, fandom=fandoms,
            relationship=relationships, additionalTags=additionalTags,
            lang=meta_block.cssselect('dd.language')[0].text_content().strip(),
            pseud=preface_block.cssselect('.byline')[0].text_content().strip(),
            chapters=chapters
        )
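
    # Work search: AO3 encodes the query as work_search[query]; page 1 omits
    # the page parameter entirely.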
    async def search_works(self, keyword: str, page: int = 1) -> SimpleSearchWorkResult:
        page = abs(page)
        url = f"https://archiveofourown.org/works/search?work_search%5Bquery%5D={uparse.quote(keyword)}"
        if page not in (0, 1):
            url += f"&page={page}"
        response = await self._get(url)
        return self.parse_search_result(response.text, keyword, page)

    def parse_search_result(self, result: str, keyword: str, page: int = 1) -> SimpleSearchWorkResult:
        tree = html.fromstring(result)
        count_block = tree.cssselect('h3.heading')[0].text_content()
        if 'Found' in count_block:
            count = Util.cleanInt(count_block.split('Found', 1)[0])
            work_blocks = tree.cssselect('ol.work.index.group > li')
            works = []
            for block in work_blocks:
                anchors = block.cssselect('div.header.module > h4.heading > a')
                match len(anchors):
                    case 1:
                        b1, b2, b3 = anchors[0], None, None
                    case 2:
                        b1, b2 = anchors
                        b3 = None
                    case 3:
                        b1, b2, b3 = anchors
                        b3 = b3.attrib['href'].split('/')[-2]
                    case _:
                        continue
                if b1 is not None:
                    work_id = int(b1.attrib['href'].split('/')[-1])
                else:
                    continue
                if b2 is not None and (match := RE.author_pseuds.match(b2.attrib['href'])):
                    username, pseud = match.groups()
                else:
                    username = pseud = ''
                datetime_block = block.cssselect('div.header.module > p.datetime')[0]
                summary = Util.html_to_text_with_newlines(summary_block[0]) if (summary_block := block.cssselect('blockquote.userstuff')) else ''
                kudoCount = Util.cleanInt(kudo_block[0].text_content()) if (kudo_block := block.cssselect('dd.kudos')) else None
                stats_block = block.cssselect('dl.stats')[0]
                works.append(SearchWorkItem(
                    workId=work_id,
                    title=b1.text_content().strip(),
                    author=username, pseud=pseud, giftTo=b3,
                    summary=summary.strip(),
                    stats=AO3WorkStat(
                        publishedTime=AO3Time.parse1(datetime_block.text_content()),
                        wordCount=Util.cleanInt(stats_block.cssselect('dd.words')[0].text_content()),
                        kudoCount=kudoCount,
                        hitCount=Util.cleanInt(stats_block.cssselect('dd.hits')[0].text_content()),
                    ),
                ))
            page_count = (
                Util.cleanInt(next_block[0].getprevious().text_content())
                if (page_block := tree.cssselect('ol.pagination.actions')) and (next_block := page_block[0].cssselect('li.next'))
                else 1
            )
            return SimpleSearchWorkResult(
                keyword=keyword, count=count, pageCount=page_count, page=page, works=works
            )
        else:
            return SimpleSearchWorkResult(
                code=1, keyword=keyword, count=-1, pageCount=-1, page=page, works=[]
            )
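
# FastAPI wiring: a single module-level Network client, created on startup via
# the lifespan context and injected into routes through require_network().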
network: Optional[Network] = None

async def startup() -> None:
    global network
    logger.info("Create httpx async client.")
    network = Network(CONFIG['http_proxy'])

async def shutdown() -> None:
    logger.info("Shutdown httpx async client.")
    # Actually close both clients so pooled connections are released.
    if network is not None:
        await network.client.aclose()
        await network.xmpp_client.aclose()

@asynccontextmanager
async def lifespan(app: FastAPI):
    await startup()
    yield
    await shutdown()

def require_network() -> Network:
    # Never returns None: a missing client raises 503 instead.
    if network is None:
        raise HTTPException(status_code=503, detail="Network client not initialized.")
    return network

app = FastAPI(lifespan=lifespan)

@app.exception_handler(Exceptions.NetworkTimeout)
async def network_timeout_handler(request: Request, exc: Exceptions.NetworkTimeout):
    return JSONResponse(status_code=504, content={"code": 1, "endpoint": exc.endpoint})

@app.exception_handler(httpx.HTTPStatusError)
async def http_status_error_handler(request: Request, exc: httpx.HTTPStatusError):
    return JSONResponse(status_code=502, content={"code": 2, "endpoint": str(exc.request.url), "status": exc.response.status_code})

# Renamed from a duplicate http_status_error_handler definition, which would
# have shadowed the handler above.
@app.exception_handler(Exceptions.Unauthorized)
async def unauthorized_handler(request: Request, exc: Exceptions.Unauthorized):
    return JSONResponse(status_code=401, content={"code": 1, "endpoint": exc.endpoint})
@app.get("/search/simple")
async def search_work_simple(
network: Network = Depends(require_network),
keyword: str = Query(...),
page: int = Query(1)
) -> SimpleSearchWorkResult:
result = await network.search_works(keyword, page)
await network.send_message(f'[Info] 简单搜索\n关键词: {keyword}\n总数: {result.count}\n页面: {result.page}/{result.pageCount}')
logger.info(f'Simple Search {keyword} count {result.count} page {result.page}/{result.pageCount}')
return result
@app.get("/work/{work_id}")
async def get_work(
network: Network = Depends(require_network),
work_id: int = Path(..., description="Work ID"),
) -> WorkDataResult:
try:
result = await network.get_page_data(work_id)
await network.send_message(f'[Info] 作品\nID: {result.workId}\n作者: {result.pseud}\n标题: {result.title}\n发布于: {result.stats.publishedTime}')
logger.info(f'Work {work_id} title: {result.title}')
except Exceptions.WorkNotFound as e:
logger.warning(f"Work not found: {e.work_id}")
raise JSONResponse(status_code=404, content={"code": 1, "work_id": e.work_id})
else: return result
@app.get("/work/{work_id}/{chapter_id}")
async def get_work_chapter(
network: Network = Depends(require_network),
work_id: int = Path(..., description="Work ID"),
chapter_id: int = Path(..., description="Chapter ID"),
) -> WorkDataResult:
try:
result = await network.get_page_data(work_id, chapter_id)
await network.send_message(f'[Info] 作品\nID: {result.workId}:{result.chapterId}\n作者: {result.pseud}\n标题: {result.title}\n发布于: {result.stats.publishedTime}')
logger.info(f'Work {work_id}:{chapter_id} title: {result.title}')
except Exceptions.WorkNotFound as e:
logger.warning(f"Work not found: {e.work_id} {e.chapter_id}")
return JSONResponse(status_code=404, content={"code": 1, "work_id": e.work_id, 'chapter_id': chapter_id})
else: return result

if __name__ == "__main__":
    uvicorn.run("main:app", host="0.0.0.0", port=28001, log_level="debug" if CONFIG['dev'] else "info", reload=CONFIG['dev'])
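
# Local smoke test against the routes above (hypothetical work/chapter IDs):
#   curl 'http://127.0.0.1:28001/search/simple?keyword=test&page=1'
#   curl 'http://127.0.0.1:28001/work/12345'
#   curl 'http://127.0.0.1:28001/work/12345/67890'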