2025-10-06 17:51:52 -04:00
|
|
|
import csv
|
2025-10-08 18:02:47 -04:00
|
|
|
import math
|
|
|
|
|
import random
|
2025-10-06 17:51:52 -04:00
|
|
|
|
2025-10-08 18:02:47 -04:00
|
|
|
import requests
|
2025-10-07 11:08:04 -04:00
|
|
|
from bs4 import BeautifulSoup, Tag
|
2025-10-06 17:51:52 -04:00
|
|
|
|
|
|
|
|
|
2025-10-08 18:02:47 -04:00
|
|
|
def repo_color(name: str) -> str:
|
|
|
|
|
lower_name = name.lower()
|
|
|
|
|
if lower_name.startswith("ubuntu"):
|
|
|
|
|
return "#e95420"
|
|
|
|
|
if lower_name.startswith("nixpkgs"):
|
|
|
|
|
return "#4f73bd"
|
|
|
|
|
if lower_name.startswith("debian"):
|
|
|
|
|
return "#d80150"
|
|
|
|
|
if lower_name.startswith("alpine"):
|
|
|
|
|
return "#0d597f"
|
|
|
|
|
if lower_name.startswith("fedora"):
|
|
|
|
|
return "#294072"
|
|
|
|
|
if lower_name.startswith("aur"):
|
|
|
|
|
return "#1793d1"
|
2025-10-06 17:51:52 -04:00
|
|
|
|
2025-10-08 18:02:47 -04:00
|
|
|
return f"#{math.floor(random.random() * 0xFFFFFF):06x}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
URL = "https://repology.org/repositories/packages"
|
|
|
|
|
HEADERS = {
|
|
|
|
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:142.0) Gecko/20100101 Firefox/142.0"
|
|
|
|
|
}
|
|
|
|
|
response = requests.get(URL, headers=HEADERS)
|
|
|
|
|
response.raise_for_status()
|
|
|
|
|
|
|
|
|
|
soup = BeautifulSoup(response.text, "html.parser")
|
|
|
|
|
|
|
|
|
|
rows: list[tuple[str, str, str, str]] = []
|
2025-10-06 17:51:52 -04:00
|
|
|
|
|
|
|
|
for tr in soup.select("tbody > tr"):
|
2025-10-08 18:02:47 -04:00
|
|
|
# First column: repo name
|
2025-10-06 17:51:52 -04:00
|
|
|
name_tag = tr.select_one("th a")
|
|
|
|
|
name = name_tag.get_text(strip=True) if name_tag else ""
|
|
|
|
|
|
2025-10-08 18:02:47 -04:00
|
|
|
# Second and third columns: prefer span[title], fall back to span text
|
2025-10-06 17:51:52 -04:00
|
|
|
td_tags = tr.select("td")
|
|
|
|
|
if len(td_tags) >= 2:
|
|
|
|
|
|
2025-10-07 11:08:04 -04:00
|
|
|
def extract_value(td: Tag) -> str:
|
2025-10-06 17:51:52 -04:00
|
|
|
span = td.select_one("span")
|
|
|
|
|
if span:
|
|
|
|
|
# Prefer title attribute, else text content
|
2025-10-07 11:08:04 -04:00
|
|
|
title = span.get("title")
|
|
|
|
|
if title is None:
|
|
|
|
|
title = span.get_text(strip=True)
|
|
|
|
|
elif type(title) is not str:
|
|
|
|
|
title = title[0]
|
|
|
|
|
return title
|
2025-10-06 17:51:52 -04:00
|
|
|
# Sometimes there's no <span>, just text inside <a> or <td>
|
|
|
|
|
return td.get_text(strip=True)
|
|
|
|
|
|
|
|
|
|
packages = extract_value(td_tags[0])
|
|
|
|
|
fresh_packages = extract_value(td_tags[1])
|
|
|
|
|
else:
|
|
|
|
|
packages = fresh_packages = ""
|
|
|
|
|
|
2025-10-08 18:02:47 -04:00
|
|
|
color = repo_color(name)
|
|
|
|
|
|
|
|
|
|
rows.append((name, color, packages, fresh_packages))
|
2025-10-06 17:51:52 -04:00
|
|
|
|
|
|
|
|
# Write to CSV
|
|
|
|
|
with open("packages.csv", "w", newline="", encoding="utf-8") as f:
|
|
|
|
|
writer = csv.writer(f)
|
2025-10-08 18:02:47 -04:00
|
|
|
writer.writerow(["name", "color", "packages", "freshPackages"])
|
2025-10-06 17:51:52 -04:00
|
|
|
writer.writerows(rows)
|
|
|
|
|
|
|
|
|
|
print("✅ Extracted", len(rows), "rows into packages.csv")
|