parser: reuse session for attachments, deduplicate news methods, remove unnecessary async

parent 0d2af83a
...@@ -17,6 +17,11 @@ class BaseParser: ...@@ -17,6 +17,11 @@ class BaseParser:
resp.raise_for_status() resp.raise_for_status()
return await resp.text(encoding=encoding) return await resp.text(encoding=encoding)
async def get_bytes(self, url: str) -> bytes:
    """Download *url* through the shared client session and return the raw body.

    Raises the session's HTTP error (non-2xx status) via raise_for_status().
    """
    async with self.session.get(url) as response:
        response.raise_for_status()
        return await response.read()
class NewsInfo: class NewsInfo:
def __init__(self, client: BaseParser): def __init__(self, client: BaseParser):
...@@ -25,33 +30,28 @@ class NewsInfo: ...@@ -25,33 +30,28 @@ class NewsInfo:
async def news_urls(self) -> models.NewsURL: async def news_urls(self) -> models.NewsURL:
return await urls_parser(self.client) return await urls_parser(self.client)
async def bugs(self) -> models.BugsModel: async def _get_packages(self, branch: str) -> models.PackagesModel | None:
url = (await self.news_urls()).bugs url = getattr(await self.news_urls(), branch, None)
if not url: if not url:
return None return None
html = await self.client.get(url, "koi8-r") html = await self.client.get(url, "koi8-r")
return await bugs_parser(html, url) return await packages_parser(html, url, self.client)
async def sisyphus(self) -> models.PackagesModel | None: async def bugs(self) -> models.BugsModel | None:
url = (await self.news_urls()).sisyphus url = (await self.news_urls()).bugs
if not url: if not url:
return None return None
html = await self.client.get(url, "koi8-r") html = await self.client.get(url, "koi8-r")
return await packages_parser(html, url) return await bugs_parser(html, url)
async def sisyphus(self) -> models.PackagesModel | None:
    """Fetch and parse package news for the Sisyphus branch.

    Returns None when the news index exposes no Sisyphus URL.
    """
    packages = await self._get_packages("sisyphus")
    return packages
async def p11(self) -> models.PackagesModel | None: async def p11(self) -> models.PackagesModel | None:
url = (await self.news_urls()).p11 return await self._get_packages("p11")
if not url:
return None
html = await self.client.get(url, "koi8-r")
return await packages_parser(html, url)
async def p10(self) -> models.PackagesModel | None: async def p10(self) -> models.PackagesModel | None:
url = (await self.news_urls()).p10 return await self._get_packages("p10")
if not url:
return None
html = await self.client.get(url, "koi8-r")
return await packages_parser(html, url)
class PackagesInfo: class PackagesInfo:
...@@ -61,7 +61,7 @@ class PackagesInfo: ...@@ -61,7 +61,7 @@ class PackagesInfo:
async def ftbfs(self) -> List[models.FTBFSModel]: async def ftbfs(self) -> List[models.FTBFSModel]:
url = "https://git.altlinux.org/beehive/stats/Sisyphus-x86_64/ftbfs-joined" url = "https://git.altlinux.org/beehive/stats/Sisyphus-x86_64/ftbfs-joined"
text = await self.client.get(url) text = await self.client.get(url)
return await ftbfs_parser(text) return ftbfs_parser(text)
async def watch_by_maintainer( async def watch_by_maintainer(
self, self,
...@@ -71,7 +71,7 @@ class PackagesInfo: ...@@ -71,7 +71,7 @@ class PackagesInfo:
url = f"https://watch.altlinux.org/pub/watch/{by_acl}/{maintainer_nickname}.txt" url = f"https://watch.altlinux.org/pub/watch/{by_acl}/{maintainer_nickname}.txt"
try: try:
text = await self.client.get(url) text = await self.client.get(url)
return await watch_parser(text) return watch_parser(text)
except: except:
return [] return []
......
...@@ -28,7 +28,7 @@ async def bugs_parser(html: str, url: str): ...@@ -28,7 +28,7 @@ async def bugs_parser(html: str, url: str):
current_bug = None current_bug = None
description_buffer = "" description_buffer = ""
section_name = await _get_bug_section_name(line) section_name = _get_bug_section_name(line)
continue continue
bug_match = bug_pattern.match(line) bug_match = bug_pattern.match(line)
...@@ -64,7 +64,7 @@ async def bugs_parser(html: str, url: str): ...@@ -64,7 +64,7 @@ async def bugs_parser(html: str, url: str):
return models.BugsModel(**data) return models.BugsModel(**data)
async def _get_bug_section_name(line: str) -> str: def _get_bug_section_name(line: str) -> str:
line = line.lower() line = line.lower()
if "new" in line and "resolved" in line: if "new" in line and "resolved" in line:
return "quickly_resolved" return "quickly_resolved"
......
import aiohttp
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import re import re
import io import io
...@@ -6,7 +5,7 @@ import gzip ...@@ -6,7 +5,7 @@ import gzip
from .. import models from .. import models
async def packages_parser(html: str, url: str): async def packages_parser(html: str, url: str, client=None):
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
pre_tag = soup.find("pre") pre_tag = soup.find("pre")
if not pre_tag: if not pre_tag:
...@@ -15,9 +14,11 @@ async def packages_parser(html: str, url: str): ...@@ -15,9 +14,11 @@ async def packages_parser(html: str, url: str):
pre_text = pre_tag.get_text(strip=True) pre_text = pre_tag.get_text(strip=True)
if "Было удалено вложение" in pre_text and "attachment" in pre_text: if "Было удалено вложение" in pre_text and "attachment" in pre_text:
attachment_link = pre_tag.find("a", href=True) attachment_link = pre_tag.find("a", href=True)
if attachment_link: if attachment_link and client:
attachment_url = attachment_link["href"] attachment_url = attachment_link["href"]
text = await _fetch(attachment_url) compressed_data = await client.get_bytes(attachment_url)
with gzip.GzipFile(fileobj=io.BytesIO(compressed_data)) as gz:
text = gz.read().decode('utf-8')
else: else:
return models.PackagesModel(**{"added": [], "removed": [], "updated": [], "url": "none"}) return models.PackagesModel(**{"added": [], "removed": [], "updated": [], "url": "none"})
else: else:
...@@ -71,7 +72,7 @@ async def packages_parser(html: str, url: str): ...@@ -71,7 +72,7 @@ async def packages_parser(html: str, url: str):
current_package = { current_package = {
"name": match.group(1), "name": match.group(1),
"description": await _clean_description(match.group(2)), "description": _clean_description(match.group(2)),
} }
seen_changelog = False seen_changelog = False
continue continue
...@@ -100,19 +101,8 @@ async def packages_parser(html: str, url: str): ...@@ -100,19 +101,8 @@ async def packages_parser(html: str, url: str):
return models.PackagesModel(**sections) return models.PackagesModel(**sections)
async def _clean_description(desc: str): def _clean_description(desc: str):
desc = desc.strip() desc = desc.strip()
desc = re.sub(r'\s+', ' ', desc) desc = re.sub(r'\s+', ' ', desc)
desc = re.sub(r'\[\d+[KMG]?\]', '', desc).strip() desc = re.sub(r'\[\d+[KMG]?\]', '', desc).strip()
return desc return desc
async def _fetch(url: str) -> str:
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
response.raise_for_status()
compressed_data = await response.read()
with gzip.GzipFile(fileobj=io.BytesIO(compressed_data)) as gz:
text = gz.read().decode('utf-8')
return text
from .. import models from .. import models
async def ftbfs_parser(text: str): def ftbfs_parser(text: str):
packages = [] packages = []
for line in text.strip().splitlines(): for line in text.strip().splitlines():
parts = line.split('\t') parts = line.split('\t')
......
from .. import models from .. import models
async def watch_parser(text: str): def watch_parser(text: str):
return [ return [
models.WatchByMaintainerModel( models.WatchByMaintainerModel(
pkg_name=parts[0], pkg_name=parts[0],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment