fedi_start/mastodon_cz_accounts.py

#!/usr/bin/env python3
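"""
mastodon_cz_accounts.py
Collects CZ/SK accounts from Mastodon through the /api/v1/directory endpoint
of a fixed list of Czech/Slovak instances (local accounts only, no language
filter) – the same idea as mstdn.cz by @adent.

Criteria:
  - listed in the instance directory (discoverable=true – the user wants to be found)
  - not suspended or limited
  - active within the last MAX_DAYS_INACTIVE days (365)
  - at least MIN_STATUSES posts (10) and MIN_FOLLOWERS followers (10)

An API token is optional; it is read from the MASTODON_TOKEN environment
variable or from a .env file next to this script.

Usage:
  python3 mastodon_cz_accounts.py
  python3 mastodon_cz_accounts.py --output /var/www/start/

Cron (every day at 3:00):
  0 3 * * * /usr/bin/python3 /opt/mastodon-start/mastodon_cz_accounts.py --output /var/www/start/ >> /var/log/mastodon-start.log 2>&1
"""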

import json, csv, time, re, argparse, logging, os
from datetime import datetime, timezone, timedelta
from pathlib import Path
import urllib.request, urllib.error, urllib.parse

def _load_token():
    """Return the Mastodon API token, or ``None`` when none is configured.

    Lookup order:
      1. the ``MASTODON_TOKEN`` environment variable;
      2. a ``.env`` file next to this script, either as a
         ``MASTODON_TOKEN=<value>`` line or as a bare line that holds only
         the token.

    Surrounding whitespace and one pair of matching quotes around a ``.env``
    value are removed. Blank values are skipped, so an empty setting never
    ends the lookup with an empty token.
    """
    token = (os.environ.get("MASTODON_TOKEN") or "").strip()
    if token:
        return token
    env_path = Path(__file__).parent / ".env"
    if not env_path.is_file():
        return None
    for line in env_path.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        if line.startswith("MASTODON_TOKEN="):
            value = line.split("=", 1)[1].strip()
        elif "=" not in line:
            value = line  # raw token value on a line of its own
        else:
            continue  # some other KEY=VALUE setting
        # dotenv files commonly quote values; the quotes are not part of the token.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        if value:
            return value
    return None

# Resolved once at import time; a falsy value means api_get() sends requests
# without an Authorization header.
MASTODON_TOKEN = _load_token()

# Module-wide logging: INFO level by default, time-of-day timestamps only.
# main() raises the level to DEBUG when --debug is given.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s", datefmt="%H:%M:%S")
log = logging.getLogger(__name__)

# ── CONFIG ────────────────────────────────────
# Instances whose local directory is harvested, in this order.
QUERY_INSTANCES = [
    # CZ/SK instances – all users are taken (no language filter)
    "mastodonczech.cz",   # 713 CZ users
    "cztwitter.cz",       # 229 CZ users
    "witter.cz",          # 212 CZ users
    "mastodon.arch-linux.cz",  # 115 CZ users
    "mastodon.pirati.cz", # 52 CZ users
    "f.cz",               # 40 CZ users
    "lgbtcz.social",      # 7 CZ users
    "boskovice.social",   # 5 CZ users
    "mamutovo.cz",
]

MIN_STATUSES      = 10   # quality filter: minimum statuses_count
MIN_FOLLOWERS     = 10   # quality filter: minimum followers_count
MAX_DAYS_INACTIVE = 365  # quality filter: maximum age of last_status_at, in days
TOP_N             = 100  # maximum number of accounts in the output (overridable with --top)
RATE_LIMIT_DELAY  = 1.2  # seconds slept between directory requests
PAGE_LIMIT        = 80   # "limit" query parameter, i.e. accounts requested per page
MAX_PAGES         = 10   # maximum number of directory pages fetched per instance

# ── HTTP ──────────────────────────────────────
def api_get(url, timeout=15, retries=2):
    """GET *url* and return the decoded JSON body, or ``None`` on failure.

    Args:
        url: Absolute URL to fetch.
        timeout: Socket timeout in seconds for one attempt.
        retries: How many times to repeat the request after an HTTP 429
            response. Each retry is preceded by a 60-second pause.

    Returns:
        The parsed JSON value, or ``None`` when the request failed, the
        rate-limit retries were exhausted, or the body was not valid JSON.

    The bearer token is sent only when ``MASTODON_TOKEN`` is set.
    """
    headers = {"User-Agent": "MamutovoStarterBot/1.0 (+https://mamutovo.cz)"}
    if MASTODON_TOKEN:
        headers["Authorization"] = f"Bearer {MASTODON_TOKEN}"
    attempts = max(0, retries) + 1
    for attempt in range(attempts):
        try:
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req, timeout=timeout) as r:
                return json.loads(r.read().decode())
        except urllib.error.HTTPError as e:
            if e.code == 429:
                if attempt + 1 < attempts:
                    # Waiting is only useful if the request is repeated
                    # afterwards; returning None here would make the caller
                    # treat the rate limit as the end of the listing.
                    log.warning("Rate limit – čekám 60s")
                    time.sleep(60)
                    continue
                log.warning(f"Rate limit – vzdávám {url}")
            elif e.code not in (404, 410):
                log.debug(f"HTTP {e.code} {url}")
            return None
        except Exception as e:
            # Best effort: network errors and malformed JSON are reported as
            # "no data" so that one broken instance does not abort the run.
            log.debug(f"Chyba {url}: {e}")
            return None
    return None

# ── SBĚR ─────────────────────────────────────
def _fetch_small_instance(instance, seen_handles, all_accounts):
    """Harvest the local directory of one instance into the shared accumulators.

    Requests ``/api/v1/directory`` with ``local=true`` in pages of
    ``PAGE_LIMIT`` entries, at most ``MAX_PAGES`` pages. Every account whose
    handle is not yet in ``seen_handles`` is tagged with ``_handle`` and
    ``_source_instance`` and appended to ``all_accounts``; both containers
    are modified in place. Paging stops on an empty, failed or non-list
    response, or after a page shorter than ``PAGE_LIMIT``.
    """
    log.info(f"directory {instance} ...")
    for page_no in range(MAX_PAGES):
        offset = page_no * PAGE_LIMIT
        url = f"https://{instance}/api/v1/directory?limit={PAGE_LIMIT}&local=true&offset={offset}"
        batch = api_get(url)
        if not (isinstance(batch, list) and batch):
            return

        fresh = 0
        for account in batch:
            name = account.get("acct", "")
            # Local accounts come without a domain part; qualify them.
            handle = f"{name}@{instance}" if "@" not in name else name
            if handle not in seen_handles:
                seen_handles.add(handle)
                account["_handle"] = handle
                account["_source_instance"] = instance
                all_accounts.append(account)
                fresh += 1
        log.debug(f"  {instance} offset={offset}: {fresh} nových")

        if len(batch) < PAGE_LIMIT:
            # A short page is the last page.
            return
        time.sleep(RATE_LIMIT_DELAY)

def fetch_all_accounts():
    """Return the de-duplicated raw accounts of every instance in QUERY_INSTANCES.

    Instances are visited in list order, with a ``RATE_LIMIT_DELAY`` pause
    after each one.
    """
    handles_seen = set()
    collected = []
    for host in QUERY_INSTANCES:
        _fetch_small_instance(host, handles_seen, collected)
        log.info(f"  → celkem {len(collected)} unikátních účtů")
        time.sleep(RATE_LIMIT_DELAY)
    log.info(f"Sběr hotov: {len(collected)} unikátních účtů")
    return collected

# ── FILTRY ────────────────────────────────────
def passes_quality(acc):
    """Return True when the raw account dict *acc* passes the quality filter.

    An account is rejected when it is suspended or limited, has fewer than
    ``MIN_STATUSES`` posts or ``MIN_FOLLOWERS`` followers, has no
    ``last_status_at``, or last posted more than ``MAX_DAYS_INACTIVE`` days
    ago. A ``last_status_at`` value that cannot be parsed does not reject
    the account (best effort).
    """
    if acc.get("suspended") or acc.get("limited"):
        return False
    if (acc.get("statuses_count") or 0) < MIN_STATUSES:
        return False
    if (acc.get("followers_count") or 0) < MIN_FOLLOWERS:
        return False
    last = acc.get("last_status_at")
    if not last:
        return False
    try:
        dt = datetime.fromisoformat(str(last).replace("Z", "+00:00"))
    except ValueError:
        # Unknown timestamp format: keep the account rather than drop it.
        return True
    if dt.tzinfo is None:
        # A date-only value such as "2024-05-01" parses to a naive datetime.
        # It must be made timezone-aware before it can be compared with the
        # aware "now" below; otherwise the comparison raises TypeError.
        dt = dt.replace(tzinfo=timezone.utc)
    cutoff = datetime.now(timezone.utc) - timedelta(days=MAX_DAYS_INACTIVE)
    return dt >= cutoff

# ── SCORING ───────────────────────────────────
def score(acc):
    """Return a 0-100 score for the raw account dict *acc*.

    Components:
      - up to 40 points for followers (linear, capped at 2000 followers);
      - up to 30 points for posts (linear, capped at 2000 posts);
      - up to 20 points for the followers/following ratio (capped at 4);
      - 10 bonus points when the domain part of ``_handle`` contains
        "mamutovo" or "czech".
    """
    followers = acc.get("followers_count") or 0
    statuses = acc.get("statuses_count") or 0
    following = acc.get("following_count") or 1

    reach_pts = min(40, int(40 * min(followers, 2000) / 2000))
    activity_pts = min(30, int(30 * min(statuses, 2000) / 2000))
    ratio_pts = min(20, int(min(followers / max(following, 1), 4) * 5))

    bonus_pts = 0
    handle = acc.get("_handle", "")
    if "@" in handle:
        domain = handle.rsplit("@", 1)[1]
        if "mamutovo" in domain or "czech" in domain:
            bonus_pts = 10

    return min(100, reach_pts + activity_pts + ratio_pts + bonus_pts)

# ── KATEGORIE ─────────────────────────────────
# Category name -> keywords that are searched as substrings in the lower-cased
# profile text. Entry order matters: categorize() returns the first category
# with a matching keyword.
CATEGORIES = {
    "tech":    ["linux", "python", "programov", "software", "opensource", "developer", "sysadmin", "git"],
    "foto":    ["fotografi", "foto", "photograph", "objektiv", "kamera"],
    "veda":    ["věda", "fyzika", "biologi", "astronom", "výzkum", "science", "matematik"],
    "kultura": ["knihy", "literatura", "film", "hudba", "divadlo", "umění"],
    "gaming":  ["gaming", "hry", "videohry", "steam", "gamer"],
    "zpravy":  ["novinář", "zprávy", "politik", "média", "journalist"],
}

def categorize(acc):
    """Return the first CATEGORIES key with a keyword in the profile text.

    The profile text is the tag-stripped ``note`` followed by the
    ``display_name``, both lower-cased. Returns "ostatni" when nothing
    matches.
    """
    bio = re.sub(r"<[^>]+>", " ", acc.get("note", "") or "")
    name = acc.get("display_name", "") or ""
    haystack = bio.lower() + " " + name.lower()
    matches = (
        category
        for category, keywords in CATEGORIES.items()
        if any(keyword in haystack for keyword in keywords)
    )
    return next(matches, "ostatni")

def extract_tags(acc):
    """Return up to four CATEGORIES keywords that occur in the account's bio.

    Only keywords longer than three characters are considered. Matching is
    a substring test against the tag-stripped, lower-cased ``note``, and the
    result keeps the order in which keywords appear in CATEGORIES.
    """
    text = re.sub(r"<[^>]+>", " ", acc.get("note", "") or "").lower()
    tags = []
    for keywords in CATEGORIES.values():
        for keyword in keywords:
            if len(keyword) <= 3 or keyword in tags:
                continue
            if keyword in text:
                tags.append(keyword.strip())
    return tags[:4]

# ── VÝSTUP ────────────────────────────────────
def build_output(raw):
    """Turn raw directory accounts into the final list of output records.

    Accounts that fail ``passes_quality`` are dropped. The rest are reduced
    to the published fields, ordered by follower count (highest first),
    de-duplicated by handle, and cut to the first ``TOP_N`` entries.
    """
    records = []
    for account in raw:
        if passes_quality(account):
            plain_bio = re.sub(r"<[^>]+>", " ", account.get("note", "") or "").strip()
            records.append({
                "name":        account.get("display_name") or account.get("username", ""),
                "handle":      account.get("_handle", account.get("acct", "")),
                "bio":         plain_bio[:220],
                "avatar":      account.get("avatar", ""),
                "followers":   account.get("followers_count", 0),
                "statuses":    account.get("statuses_count",  0),
                "score":       score(account),
                "tags":        extract_tags(account),
                "category":    categorize(account),
                "last_active": account.get("last_status_at", ""),
                "url":         account.get("url", ""),
            })

    records.sort(key=lambda rec: rec["followers"], reverse=True)

    # Dicts keep insertion order, so the first record per handle wins.
    by_handle = {}
    for rec in records:
        by_handle.setdefault(rec["handle"], rec)
    return list(by_handle.values())[:TOP_N]

def write_json(accounts, output_dir):
    """Write *accounts* to ``accounts.json`` inside *output_dir*.

    The file holds a UTC generation timestamp, the number of accounts and
    the account records. It is always encoded as UTF-8: the JSON is written
    with ``ensure_ascii=False``, so relying on the locale's default encoding
    could fail or produce a non-UTF-8 file.

    Args:
        accounts: List of JSON-serialisable account records.
        output_dir: ``pathlib.Path`` of an existing directory.
    """
    data = {"generated_at": datetime.now(timezone.utc).isoformat(), "count": len(accounts), "accounts": accounts}
    p = output_dir / "accounts.json"
    p.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    log.info(f"JSON: {p} ({len(accounts)} účtů)")

def write_csv(accounts, output_dir):
    """Write ``accounts.csv`` into *output_dir*.

    The file has the header ``Account address,Show boosts`` followed by one
    row per account: its handle and the literal ``true``.
    """
    target = output_dir / "accounts.csv"
    with target.open("w", newline="", encoding="utf-8") as handle_file:
        writer = csv.writer(handle_file)
        writer.writerow(["Account address", "Show boosts"])
        writer.writerows([entry["handle"], "true"] for entry in accounts)
    log.info(f"CSV:  {target}")

# ── MAIN ──────────────────────────────────────
def main():
    """Command-line entry point: collect, filter and export the accounts.

    Options: ``--output`` (target directory, created if missing), ``--top``
    (overrides the module-level ``TOP_N``) and ``--debug`` (DEBUG logging).

    Returns:
        0 on success, 1 when no account survived filtering.
    """
    global TOP_N
    cli = argparse.ArgumentParser()
    cli.add_argument("--output", default=".", help="Výstupní adresář")
    cli.add_argument("--top", default=TOP_N, type=int)
    cli.add_argument("--debug", action="store_true")
    opts = cli.parse_args()

    if opts.debug:
        logging.getLogger().setLevel(logging.DEBUG)
    TOP_N = opts.top

    target_dir = Path(opts.output)
    target_dir.mkdir(parents=True, exist_ok=True)

    log.info(f"Startuji – {len(QUERY_INSTANCES)} instancí")
    accounts = build_output(fetch_all_accounts())
    if accounts:
        log.info(f"Po filtraci: {len(accounts)} účtů")
        write_json(accounts, target_dir)
        write_csv(accounts, target_dir)
        log.info("Hotovo.")
        return 0

    log.error("Žádné účty! Zkontroluj připojení.")
    return 1

if __name__ == "__main__":
    # The bare exit() helper is injected by the site module for interactive
    # use and is missing under "python -S"; SystemExit is always available
    # and sets the process exit status to main()'s return value.
    raise SystemExit(main())