First commit

2025-06-09 17:36:34 +08:00
commit 8ca338ce46
2 changed files with 567 additions and 0 deletions

.gitignore vendored Normal file (+2 lines)

@@ -0,0 +1,2 @@
__pycache__/*
dev

main.py Normal file (+565 lines)

@@ -0,0 +1,565 @@
import os
import time
import random
import logging
import uuid
import inspect
import copy
import re
from urllib import parse as uparse
from datetime import datetime
from typing import Optional, List, Any, Tuple, Dict, Callable
from contextlib import asynccontextmanager
from enum import Enum
import ujson
from fastapi import FastAPI, HTTPException, Request, Response, File, Form, Query, UploadFile, Depends, Path
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, create_model
from pydantic.dataclasses import dataclass
from lxml import html
from httpx import AsyncClient, TimeoutException
from httpx_retries import RetryTransport, Retry
from urllib.parse import urlparse
import httpx
import uvicorn

# CONFIG holds mixed value types ('dev' is a bool), so annotate Dict[str, Any].
CONFIG: Dict[str, Any] = {
    'http_proxy': 'http://10.0.0.3:20171/',
    'dev': os.path.isfile('./dev')
}
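
# Dev mode is toggled by the presence of a local `dev` marker file (ignored
# via .gitignore); it enables debug logging, a long-lived HTTP cache and
# uvicorn autoreload.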
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("cache")
logger.setLevel(logging.DEBUG if CONFIG['dev'] else logging.WARNING)
logger = logging.getLogger("network")
logger.setLevel(logging.DEBUG if CONFIG['dev'] else logging.WARNING)
logger = logging.getLogger("main")
logger.setLevel(logging.DEBUG if CONFIG['dev'] else logging.INFO)

class Exceptions:
    class WorkNotFound(Exception):
        def __init__(self, wid: int, cid: Optional[int] = None, *args: Any):
            super().__init__(*args)
            self.work_id = wid
            self.chapter_id = cid

    class NetworkTimeout(Exception):
        def __init__(self, endpoint, *args: Any):
            super().__init__(*args)
            self.endpoint = endpoint

    class Unauthorized(Exception):
        def __init__(self, endpoint, *args: Any):
            super().__init__(*args)
            self.endpoint = endpoint
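
# Pre-compiled regular expressions for extracting user/pseud and
# work/chapter IDs from AO3 URL paths.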
class RE:
    author_pseuds = re.compile(r"^/users/([^/]+)/pseuds/([^/]+)$")
    chapter_work = re.compile(r"/works/(\d+)/chapters/(\d+)")

@dataclass
class Pair:
    left: Any
    right: Any

class Util:
    @staticmethod
    def cleanInt(string: str) -> int:
        digits_only = ''.join(filter(str.isdigit, string))
        number = int(digits_only)
        if string and string[0] == '-':
            number = -number
        return number

    @staticmethod
    def html_to_text_with_newlines(stree: html.HtmlElement) -> str:
        tree = copy.deepcopy(stree)
        for br in tree.xpath('//br'):
            if br.tail:
                br.tail = '\n' + br.tail
            else:
                br.tail = '\n'
        for p in tree.xpath('//p'):
            if p.tail:
                p.tail = '\n\n' + p.tail
            else:
                p.tail = '\n\n'
        return tree.text_content()

    @staticmethod
    def split_lines(text: str) -> list[str]:
        # filter() yields an iterator; materialize it to match the annotation.
        return [line for line in text.split('\n') if line]
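
# A minimal in-memory TTL cache for coroutine results, keyed by string (here:
# request URLs). expire=None caches forever; a negative expire bypasses the
# cache entirely.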
class AsyncCache:
    def __init__(self, fallback: Optional[Callable[[str], Any]] = None, expire: Optional[int] = None):
        self.expire = expire
        self.cache: dict[str, Pair] = {}
        self.fallback = fallback
        self.logger = logging.getLogger("cache")

    async def __call__(self, key: str) -> Any:
        return await self.get(key)

    async def get(self, key: str) -> Any:
        if key in self.cache:
            item = self.cache[key]
            if self.expire:
                if self.expire < 0:
                    self.logger.debug(f'Miss {key} skip')
                    return await self.resolve(key)
                if time.time() - item.right > self.expire:
                    self.logger.debug(f'Miss {key} expired')
                    return await self.resolve(key)
                else:
                    self.logger.debug(f'Hit {key}')
                    return item.left
            else:
                self.logger.debug(f'Hit {key} skip')
                return item.left
        else:
            self.logger.debug(f'Miss {key} notfound')
            return await self.resolve(key)

    def set(self, key: str, value: Any):
        self.logger.debug(f'Set {key}')
        self.cache[key] = Pair(value, time.time())

    async def resolve(self, key: str) -> Any:
        self.logger.debug(f'Resolve {key}')
        if self.fallback:
            st = time.time()
            value = self.fallback(key)
            if inspect.isawaitable(value):
                self.logger.debug(f'Resolve {key} await')
                value = await value
            et = time.time()
            self.set(key, value)
            self.logger.debug(f'Resolve {key} done in {et - st:.4f}')
            return value
        else:
            self.logger.warning('No fallback function')
            raise KeyError(key)
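
# Usage sketch (hypothetical names, assuming some async fetcher coroutine):
#
#     fetch = AsyncCache(fallback=some_async_get, expire=1800)
#     resp = await fetch("https://example.org/")  # resolved via fallback
#     resp = await fetch("https://example.org/")  # served from cache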

class Category(Enum):
    FF = "ff"  # GL
    FM = "fm"  # BG
    MM = "mm"  # BL
    NONE = "none"
    MULTI = "multi"
    UNKNOWN = "unknown"

    @classmethod
    def parse(cls, typ: str) -> "Category":
        match typ.strip():
            case 'F/F': return cls.FF
            case 'F/M': return cls.FM
            case 'M/M': return cls.MM
            case 'Gen': return cls.NONE
            case 'Multi': return cls.MULTI
            case _: return cls.UNKNOWN

@dataclass
class AO3Time:
    year: int
    month: int
    date: int

    # Month-name lookup; accept both "sep" and "sept" since the exact
    # abbreviation AO3 renders is not guaranteed here.
    _mon_name = {
        "jan": 1,
        "feb": 2,
        "mar": 3,
        "apr": 4,
        "may": 5,
        "jun": 6,
        "jul": 7,
        "aug": 8,
        "sep": 9,
        "sept": 9,
        "oct": 10,
        "nov": 11,
        "dec": 12,
    }
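
    # parse() reads the ISO-style "YYYY-MM-DD" dates shown on work pages;
    # parse1() reads the "DD Mon YYYY" form used on search result cards.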
    @classmethod
    def parse(cls, string: str) -> "AO3Time":
        year, month, date = string.strip().split('-', 2)
        return cls(
            year=int(year),
            month=int(month),
            date=int(date),
        )

    @classmethod
    def parse1(cls, string: str) -> "AO3Time":
        date, month, year = string.strip().split(' ', 2)
        return cls(
            year=int(year),
            month=cls._mon_name.get(month.lower(), -1),
            date=int(date),
        )

    def __repr__(self) -> str:
        return f'{self.year}-{self.month}-{self.date}'

@dataclass
class AO3WorkStat:
    publishedTime: AO3Time
    wordCount: int
    hitCount: int
    kudoCount: Optional[int] = None
    commentCount: Optional[int] = None
    bookmarkCount: Optional[int] = None
    chapter: Optional[Pair] = None
    updatedTime: Optional[AO3Time] = None
    completedTime: Optional[AO3Time] = None

@dataclass
class WorkDataResult:
    @dataclass
    class ChapterItem:
        title: str
        chapterId: int

    workId: int
    chapterId: Optional[int]
    title: str
    text: list[str]
    pseud: str
    lang: str
    stats: AO3WorkStat
    summary: Optional[str] = None
    fandom: Optional[list[str]] = None
    category: Optional[list[Category]] = None
    relationship: Optional[list[str]] = None
    additionalTags: Optional[list[str]] = None
    code: int = 0
    chapters: Optional[list[ChapterItem]] = None
    chapterIndex: Optional[int] = None

@dataclass
class SearchWorkItem:
    workId: int
    title: str
    pseud: str
    author: str
    summary: str
    stats: AO3WorkStat
    giftTo: Optional[str] = None

@dataclass
class SimpleSearchWorkResult:
    keyword: str
    count: int
    pageCount: int
    page: int
    works: list[SearchWorkItem]
    code: int = 0
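
# Network owns two httpx clients: a retrying, optionally proxied client for
# AO3 itself, and a plain client for pushing notifications to an XMPP bridge.
# Raw GET responses are memoized through AsyncCache; parsed works are memoized
# separately in self.works.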
class Network:
    def __init__(self, proxy: Optional[str] = None) -> None:
        self.logger = logging.getLogger("network")
        self.http_proxy: Optional[str] = proxy
        self.client = AsyncClient(
            proxy=self.http_proxy,
            transport=RetryTransport(retry=Retry(
                total=5,
                backoff_factor=0.5
            ))
        )
        self.xmpp_client = httpx.AsyncClient(timeout=10)
        # AsyncCache is a module-level class, not a member of Util.
        self._get = AsyncCache(self._real_get, 36000000 if CONFIG['dev'] else 1800)
        self.works = {}

    async def send_message(self, msg: str):
        url = "http://10.0.0.3:52222/send"
        headers = {"Content-Type": "application/json"}
        payload = {
            "alias": "ao3mirror",
            "message": msg
        }
        try:
            response = await self.xmpp_client.post(url, headers=headers, json=payload)
            response.raise_for_status()
        except httpx.HTTPError as e:
            self.logger.error(e)

    async def _real_get(self, uri: str) -> httpx.Response:
        url = urlparse(uri)
        try:
            self.logger.debug(f'HTTP Get {uri}')
            response = await self.client.get(uri, headers=self._build_headers(url.scheme, url.netloc))
        except httpx.TimeoutException as e:
            raise Exceptions.NetworkTimeout(uri) from e
        response.raise_for_status()
        return response

    def _build_headers(self, scheme: str, host: str) -> Dict[str, str]:
        return {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:137.0) Gecko/20100101 Firefox/137.0",
            "Referer": f"{scheme}://{host}/",
            "Origin": f"{scheme}://{host}"
        }
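
    # AO3 answers a bare work URL for a multi-chapter work with a 302 to its
    # first chapter, and answers restricted works with a redirect to the login
    # page. Redirects are not followed automatically, so both cases surface
    # here as HTTPStatusError and are dispatched on the Location header.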
    async def _get_page_data(self, work_id: int, chapter_id: Optional[int] = None) -> WorkDataResult:
        url = (
            f'https://archiveofourown.org/works/{work_id}/chapters/{chapter_id}?view_adult=yes' if chapter_id
            else f'https://archiveofourown.org/works/{work_id}?view_adult=yes'
        )
        try:
            response = await self._get(url)
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                raise Exceptions.WorkNotFound(work_id, chapter_id) from e
            elif e.response.status_code == 302:
                newurl = e.response.headers.get('location')
                if (result := RE.chapter_work.match(newurl)) and (ids := result.groups()):
                    work_id, chapter_id = ids
                    self.logger.info(f'Work {work_id} has first chapter {chapter_id}')
                    try:
                        work_id = Util.cleanInt(work_id)
                        chapter_id = Util.cleanInt(chapter_id)
                    except ValueError as e1:
                        raise e1 from e
                    return await self.get_page_data(work_id, chapter_id)
                elif newurl.endswith('/users/login?restricted=true'):
                    raise Exceptions.Unauthorized(url) from e
                else:
                    self.logger.debug(f'Unknown 302 to: {newurl}')
                    raise
            else:
                self.logger.debug(f'Unknown code: {e.response.status_code}')
                raise
        return self.parse_page_data(response.text, work_id, chapter_id)

    async def get_page_data(self, work_id: int, chapter_id: Optional[int] = None) -> WorkDataResult:
        wid = (work_id, chapter_id)
        if wid in self.works:
            self.logger.debug(f"Work {work_id}:{chapter_id} hit cache")
            return self.works[wid]
        else:
            self.logger.debug(f"Work {work_id}:{chapter_id} miss cache")
            result = await self._get_page_data(work_id, chapter_id)
            self.works[wid] = result
            return result
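
    # Scrape one work (or chapter) page: the dl.work meta block yields status,
    # tags, category, fandom and stats; #workskin yields title, byline, summary
    # and the chapter text; the chapter <select> yields the chapter list.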
    def parse_page_data(self, result: str, work_id: int, chapter_id: Optional[int]) -> WorkDataResult:
        tree = html.fromstring(result)
        meta_block = tree.cssselect('div.wrapper')[0].cssselect('dl.work')[0]
        stats_block = meta_block.cssselect('dl.stats')[0]
        if (tblock := stats_block.cssselect('dt.status')) and (dblock := stats_block.cssselect('dd.status')):
            tblock = tblock[0]
            dblock = dblock[0]
            match tblock.text_content().strip().lower():
                case 'completed:':
                    completedTime = AO3Time.parse(dblock.text_content())
                    updatedTime = None
                case 'updated:':
                    completedTime = None
                    updatedTime = AO3Time.parse(dblock.text_content())
                case _:
                    completedTime = None
                    updatedTime = None
        else:
            completedTime = None
            updatedTime = None
        if block := meta_block.cssselect('dd.category'):
            categories = [Category.parse(item.text_content().strip()) for item in block[0].cssselect('li a')]
        else:
            categories = []
        if block := meta_block.cssselect('dd.fandom'):
            fandoms = [item.text_content().strip() for item in block[0].cssselect('li a')]
        else:
            fandoms = []
        if block := meta_block.cssselect('dd.relationship'):
            relationships = [item.text_content().strip() for item in block[0].cssselect('li a')]
        else:
            relationships = []
        if block := meta_block.cssselect('dd.freeform.tags'):
            additionalTags = [item.text_content().strip() for item in block[0].cssselect('li a')]
        else:
            additionalTags = []
        body_block = tree.xpath('//*[@id="workskin"]')[0]
        preface_block = body_block.cssselect('.preface')[0]
        if chapter_block := meta_block.cssselect('dd.chapters'):
            chapter_block = chapter_block[0]
            left, right = chapter_block.text_content().split('/')
            right = right.strip()
            if right == '?':
                right = -1
            else:
                right = Util.cleanInt(right)
            chapter = Pair(Util.cleanInt(left), right)
        else:
            chapter = None
        text = []
        for p in body_block.cssselect('div.userstuff p'):
            text.extend(Util.split_lines(Util.html_to_text_with_newlines(p)))
        if chapter_id:
            chapters = []
            chapterIndex = 0
            index = 0
            if chapter_block := tree.cssselect('div.work ul.work.navigation.actions li.chapter ul#chapter_index.expandable.secondary li form'):
                for chapter_option in chapter_block[0].cssselect('select#selected_id')[0].cssselect('option'):
                    # Split only on the first '.' so chapter titles containing
                    # dots survive intact.
                    title = chapter_option.text_content().split('.', 1)[1].strip()
                    cchapter_id = int(chapter_option.attrib['value'])
                    chapters.append(WorkDataResult.ChapterItem(title, cchapter_id))
                    if chapter_id == cchapter_id:
                        chapterIndex = index
                    index += 1
            else:
                chapters.append(WorkDataResult.ChapterItem(
                    tree.cssselect('div#workskin div#chapters h3.title')[0].text_content().split(':', 1)[1].strip(),
                    chapter_id
                ))
        else:
            chapters = chapterIndex = None
        summary = (
            Util.html_to_text_with_newlines(summary_block[0]).strip()
            if (summary_block := tree.cssselect('blockquote.userstuff')) else None
        )
        return WorkDataResult(
            workId=work_id, chapterId=chapter_id, chapterIndex=chapterIndex,
            title=preface_block.cssselect("h2.title")[0].text_content().strip(),
            summary=summary, text=text,
            stats=AO3WorkStat(
                publishedTime=AO3Time.parse(stats_block.cssselect('dd.published')[0].text_content()),
                completedTime=completedTime, updatedTime=updatedTime,
                wordCount=Util.cleanInt(stats_block.cssselect('dd.words')[0].text_content()),
                # kudoCount=Util.cleanInt(stats_block.cssselect('dd.kudos')[0].text_content()),
                hitCount=Util.cleanInt(stats_block.cssselect('dd.hits')[0].text_content()),
                chapter=chapter
            ),
            category=categories, fandom=fandoms,
            relationship=relationships, additionalTags=additionalTags,
            lang=meta_block.cssselect('dd.language')[0].text_content().strip(),
            pseud=preface_block.cssselect('.byline')[0].text_content().strip(),
            chapters=chapters
        )
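
    # Work search: AO3 encodes the query as work_search[query]; page 1 omits
    # the page parameter entirely.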
    async def search_works(self, keyword: str, page: int = 1) -> SimpleSearchWorkResult:
        page = abs(page)
        url = f"https://archiveofourown.org/works/search?work_search%5Bquery%5D={uparse.quote(keyword)}"
        if page not in (0, 1):
            url += f"&page={page}"
        response = await self._get(url)
        return self.parse_search_result(response.text, keyword, page)

    def parse_search_result(self, result: str, keyword: str, page: int = 1) -> SimpleSearchWorkResult:
        tree = html.fromstring(result)
        count_block = tree.cssselect('h3.heading')[0].text_content()
        if 'Found' in count_block:
            count = Util.cleanInt(count_block.split('Found', 1)[0])
            work_blocks = tree.cssselect('ol.work.index.group > li')
            works = []
            for block in work_blocks:
                anchors = block.cssselect('div.header.module > h4.heading > a')
                match len(anchors):
                    case 1:
                        b1, b2, b3 = anchors[0], None, None
                    case 2:
                        b1, b2 = anchors
                        b3 = None
                    case 3:
                        b1, b2, b3 = anchors
                        b3 = b3.attrib['href'].split('/')[-2]
                    case _:
                        continue
                if b1 is not None:
                    work_id = int(b1.attrib['href'].split('/')[-1])
                else:
                    continue
                if b2 is not None and (match := RE.author_pseuds.match(b2.attrib['href'])):
                    username, pseud = match.groups()
                else:
                    username = pseud = ''
                datetime_block = block.cssselect('div.header.module > p.datetime')[0]
                summary = Util.html_to_text_with_newlines(summary_block[0]) if (summary_block := block.cssselect('blockquote.userstuff')) else ''
                kudoCount = Util.cleanInt(kudo_block[0].text_content()) if (kudo_block := block.cssselect('dd.kudos')) else None
                stats_block = block.cssselect('dl.stats')[0]
                works.append(SearchWorkItem(
                    workId=work_id,
                    title=b1.text_content().strip(),
                    author=username, pseud=pseud, giftTo=b3,
                    summary=summary.strip(),
                    stats=AO3WorkStat(
                        publishedTime=AO3Time.parse1(datetime_block.text_content()),
                        wordCount=Util.cleanInt(stats_block.cssselect('dd.words')[0].text_content()),
                        kudoCount=kudoCount,
                        hitCount=Util.cleanInt(stats_block.cssselect('dd.hits')[0].text_content()),
                    ),
                ))
            page_count = (
                Util.cleanInt(next_block[0].getprevious().text_content())
                if (page_block := tree.cssselect('ol.pagination.actions')) and (next_block := page_block[0].cssselect('li.next'))
                else 1
            )
            return SimpleSearchWorkResult(
                keyword=keyword, count=count, pageCount=page_count, page=page, works=works
            )
        else:
            return SimpleSearchWorkResult(
                code=1, keyword=keyword, count=-1, pageCount=-1, page=page, works=[]
            )
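
# FastAPI wiring: a single module-level Network client, created on startup via
# the lifespan context and injected into routes through require_network().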
network: Optional[Network] = None

async def startup() -> None:
    global network
    logger.info("Create httpx async client.")
    network = Network(CONFIG['http_proxy'])

async def shutdown() -> None:
    logger.info("Shutdown httpx async client.")
    # Actually close both clients so pooled connections are released.
    if network is not None:
        await network.client.aclose()
        await network.xmpp_client.aclose()

@asynccontextmanager
async def lifespan(app: FastAPI):
    await startup()
    yield
    await shutdown()

def require_network() -> Network:
    # Never returns None: a missing client raises 503 instead.
    if network is None:
        raise HTTPException(status_code=503, detail="Network client not initialized.")
    return network

app = FastAPI(lifespan=lifespan)

@app.exception_handler(Exceptions.NetworkTimeout)
async def network_timeout_handler(request: Request, exc: Exceptions.NetworkTimeout):
    return JSONResponse(status_code=504, content={"code": 1, "endpoint": exc.endpoint})

@app.exception_handler(httpx.HTTPStatusError)
async def http_status_error_handler(request: Request, exc: httpx.HTTPStatusError):
    return JSONResponse(status_code=502, content={"code": 2, "endpoint": str(exc.request.url), "status": exc.response.status_code})

# Renamed from a duplicate http_status_error_handler definition, which would
# have shadowed the handler above.
@app.exception_handler(Exceptions.Unauthorized)
async def unauthorized_handler(request: Request, exc: Exceptions.Unauthorized):
    return JSONResponse(status_code=401, content={"code": 1, "endpoint": exc.endpoint})
@app.get("/search/simple")
async def search_work_simple(
network: Network = Depends(require_network),
keyword: str = Query(...),
page: int = Query(1)
) -> SimpleSearchWorkResult:
result = await network.search_works(keyword, page)
await network.send_message(f'[Info] 简单搜索\n关键词: {keyword}\n总数: {result.count}\n页面: {result.page}/{result.pageCount}')
logger.info(f'Simple Search {keyword} count {result.count} page {result.page}/{result.pageCount}')
return result
@app.get("/work/{work_id}")
async def get_work(
network: Network = Depends(require_network),
work_id: int = Path(..., description="Work ID"),
) -> WorkDataResult:
try:
result = await network.get_page_data(work_id)
await network.send_message(f'[Info] 作品\nID: {result.workId}\n作者: {result.pseud}\n标题: {result.title}\n发布于: {result.stats.publishedTime}')
logger.info(f'Work {work_id} title: {result.title}')
except Exceptions.WorkNotFound as e:
logger.warning(f"Work not found: {e.work_id}")
raise JSONResponse(status_code=404, content={"code": 1, "work_id": e.work_id})
else: return result
@app.get("/work/{work_id}/{chapter_id}")
async def get_work_chapter(
network: Network = Depends(require_network),
work_id: int = Path(..., description="Work ID"),
chapter_id: int = Path(..., description="Chapter ID"),
) -> WorkDataResult:
try:
result = await network.get_page_data(work_id, chapter_id)
await network.send_message(f'[Info] 作品\nID: {result.workId}:{result.chapterId}\n作者: {result.pseud}\n标题: {result.title}\n发布于: {result.stats.publishedTime}')
logger.info(f'Work {work_id}:{chapter_id} title: {result.title}')
except Exceptions.WorkNotFound as e:
logger.warning(f"Work not found: {e.work_id} {e.chapter_id}")
return JSONResponse(status_code=404, content={"code": 1, "work_id": e.work_id, 'chapter_id': chapter_id})
else: return result

if __name__ == "__main__":
    uvicorn.run("main:app", host="0.0.0.0", port=28001, log_level="debug" if CONFIG['dev'] else "info", reload=CONFIG['dev'])
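
# Local smoke test against the routes above (hypothetical work/chapter IDs):
#   curl 'http://127.0.0.1:28001/search/simple?keyword=test&page=1'
#   curl 'http://127.0.0.1:28001/work/12345'
#   curl 'http://127.0.0.1:28001/work/12345/67890'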