fedi_start/mastodon_cz_accounts.py

#!/usr/bin/env python3
"""
mastodon_cz_accounts.py
Sbírá CZ/SK účty z Mastodonu přes /api/v1/directory?language=cs
– stejná logika jako mstdn.cz od @adent.

Kritéria:
  - discoverable=true (uživatel chce být nalezen)
  - jazyk příspěvků nastaven na cs nebo sk
  - aktivní za posledních 30 dní
  - min. 10 příspěvků

Použití:
  python3 mastodon_cz_accounts.py
  python3 mastodon_cz_accounts.py --output /var/www/start/

Cron (každý den v 3:00):
  0 3 * * * /usr/bin/python3 /opt/mastodon-start/mastodon_cz_accounts.py --output /var/www/start/ >> /var/log/mastodon-start.log 2>&1
"""

import json, csv, time, re, argparse, logging, os
from datetime import datetime, timezone, timedelta
from pathlib import Path
import urllib.request, urllib.error, urllib.parse

def _load_tokens():
    """Collect API tokens from the environment, falling back to a ``.env`` file.

    For each of ``MASTODON_TOKEN`` and ``GTS_TOKEN`` the process environment
    is consulted first. If the variable is unset or empty, the first
    ``KEY=value`` line for that key in the ``.env`` file located next to this
    script is used instead. As a legacy fallback, when no ``MASTODON_TOKEN``
    was found at all, the first non-empty, non-comment line that contains no
    ``=`` is taken as the raw Mastodon token.

    Values are stripped of surrounding whitespace and of one pair of matching
    quotes, so ``KEY="token"`` and ``KEY = token`` are both accepted.

    Returns:
        dict mapping the key name to its token; keys without a usable value
        are omitted.
    """
    def _clean(raw):
        # Strip whitespace and one pair of matching surrounding quotes.
        value = raw.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1].strip()
        return value

    env_path = Path(__file__).parent / ".env"
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    env_lines = (
        [ln.strip() for ln in env_path.read_text(encoding="utf-8").splitlines()]
        if env_path.exists()
        else []
    )

    tokens = {}
    for key in ("MASTODON_TOKEN", "GTS_TOKEN"):
        val = os.environ.get(key)
        if not val:
            for line in env_lines:
                name, sep, rest = line.partition("=")
                if sep and name.strip() == key:
                    val = rest
                    break
        val = _clean(val) if val else ""
        # Only store non-empty tokens; an empty entry would otherwise block
        # the legacy fallback below.
        if val:
            tokens[key] = val

    # Fallback: raw token value (legacy .env without a key name).
    if "MASTODON_TOKEN" not in tokens:
        for line in env_lines:
            if line and not line.startswith("#") and "=" not in line:
                raw_token = _clean(line)
                if raw_token:
                    tokens["MASTODON_TOKEN"] = raw_token
                    break
    return tokens

# Tokens are resolved once, when the module is imported.
_TOKENS = _load_tokens()
MASTODON_TOKEN = _TOKENS.get("MASTODON_TOKEN")  # None when no Mastodon token was found
GTS_TOKEN      = _TOKENS.get("GTS_TOKEN")       # None when no GoToSocial token was found

def _token_for(instance: str) -> str | None:
    """Pick the bearer token to use for requests to ``instance``.

    Returns ``GTS_TOKEN`` when it is configured and the domain contains the
    substring ``'gts.'`` (the heuristic used here for GoToSocial instances);
    otherwise returns ``MASTODON_TOKEN``, which may be ``None``.

    NOTE(review): every non-GTS domain receives the same ``MASTODON_TOKEN``,
    so that token is sent to all queried instances — confirm this is intended.
    """
    if not GTS_TOKEN or "gts." not in instance:
        return MASTODON_TOKEN
    return GTS_TOKEN

# Root logger: INFO level by default, timestamps as HH:MM:SS (main() switches to DEBUG with --debug).
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s", datefmt="%H:%M:%S")
# Module-level logger used by every function in this file.
log = logging.getLogger(__name__)

# ── CONFIG ────────────────────────────────────
# Instances whose local directory is paged through by fetch_all_accounts().
QUERY_INSTANCES = [
    # CZ/SK instances – all users are taken (no language filter)
    "mastodonczech.cz",   # 713 CZ users
    "cztwitter.cz",       # 229 CZ users
    "witter.cz",          # 212 CZ users
    "mastodon.pirati.cz", # 52 CZ users
    "f.cz",               # 40 CZ users
    "lgbtcz.social",      # 7 CZ users
    "boskovice.social",   # 5 CZ users
    "mamutovo.cz",
    "gts.arch-linux.cz",
    "kompost.cz",
    "spondr.cz",
    "skorpil.cz",
    "ajtaci.club",
    "toot.whatever.cz",
]

MIN_STATUSES      = 10   # passes_quality(): minimum number of posts
MIN_FOLLOWERS     = 10   # passes_quality(): minimum number of followers
MAX_DAYS_INACTIVE = 90   # passes_quality(): maximum age of the last post, in days
TOP_N             = 250  # build_output(): size cap of the result; overridden by --top in main()
RATE_LIMIT_DELAY  = 1.2  # seconds passed to time.sleep() between API requests
PAGE_LIMIT        = 80   # value of the `limit` query parameter for directory pages
MAX_PAGES         = 10   # maximum number of directory pages fetched per instance

# ── HTTP ──────────────────────────────────────
def api_get(url, timeout=15, token=None, retries=1):
    """GET ``url`` and return the decoded JSON body, or ``None`` on any failure.

    Args:
        url: Full URL to request.
        timeout: Timeout in seconds passed to ``urlopen``.
        token: Bearer token for the ``Authorization`` header. ``None`` falls
            back to the module-level ``MASTODON_TOKEN``; an empty string
            sends no ``Authorization`` header at all.
        retries: How many times the request is repeated after an HTTP 429
            response. Each 429 is followed by a 60 second pause.

    Returns:
        The parsed JSON value, or ``None`` when the request failed, the
        response was not valid JSON, or the rate limit persisted.
    """
    headers = {"User-Agent": "MamutovoStarterBot/1.0 (+https://mamutovo.cz)"}
    tok = token if token is not None else MASTODON_TOKEN
    if tok:
        headers["Authorization"] = f"Bearer {tok}"

    retries_left = max(0, retries)
    while True:
        try:
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req, timeout=timeout) as r:
                return json.loads(r.read().decode())
        except urllib.error.HTTPError as e:
            if e.code == 429:
                log.warning("Rate limit – čekám 60s")
                time.sleep(60)
                if retries_left > 0:
                    # Retry after the pause instead of dropping the request;
                    # returning None here would end the caller's pagination.
                    retries_left -= 1
                    continue
            elif e.code not in (404, 410):
                # 404/410 are expected for missing accounts and stay silent.
                log.debug(f"HTTP {e.code} {url}")
            return None
        except Exception as e:
            # Best-effort boundary: network errors, timeouts, invalid JSON
            # and malformed URLs all result in None.
            log.debug(f"Chyba {url}: {e}")
            return None

# ── SBĚR ─────────────────────────────────────
def _fetch_small_instance(instance, seen_handles, all_accounts):
    """Page through the local directory of ``instance`` and collect new accounts.

    Up to ``MAX_PAGES`` pages of ``PAGE_LIMIT`` entries are requested. Each
    account whose lower-cased handle is not yet in ``seen_handles`` is tagged
    with ``_handle`` and ``_source_instance``, appended to ``all_accounts``
    and its handle added to ``seen_handles``. Both containers are mutated in
    place; nothing is returned.

    Paging stops at the first empty or non-list response and after a page
    shorter than ``PAGE_LIMIT``.
    """
    log.info(f"directory {instance} ...")
    auth = _token_for(instance)
    for page_no in range(MAX_PAGES):
        start = page_no * PAGE_LIMIT
        endpoint = (f"https://{instance}/api/v1/directory"
                    f"?limit={PAGE_LIMIT}&local=true&offset={start}")
        entries = api_get(endpoint, token=auth)
        if not (entries and isinstance(entries, list)):
            return

        fresh = 0
        for entry in entries:
            name = entry.get("acct", "")
            # A bare local name gets the instance domain appended.
            full_handle = name if "@" in name else f"{name}@{instance}"
            key = full_handle.lower()
            if key in seen_handles:
                continue
            seen_handles.add(key)
            entry["_handle"] = full_handle
            entry["_source_instance"] = instance
            all_accounts.append(entry)
            fresh += 1
        log.debug(f"  {instance} offset={start}: {fresh} nových")

        # A short page means the directory is exhausted.
        if len(entries) < PAGE_LIMIT:
            return
        time.sleep(RATE_LIMIT_DELAY)

def fetch_all_accounts():
    """Collect directory accounts from every instance in ``QUERY_INSTANCES``.

    Handles are deduplicated across instances. Returns the list of raw
    account dicts, each tagged by ``_fetch_small_instance``.
    """
    collected, known_handles = [], set()
    for host in QUERY_INSTANCES:
        _fetch_small_instance(host, known_handles, collected)
        log.info(f"  → celkem {len(collected)} unikátních účtů")
        # Pause between instances as well as between pages.
        time.sleep(RATE_LIMIT_DELAY)
    log.info(f"Sběr hotov: {len(collected)} unikátních účtů")
    return collected

def load_manual_accounts(seen_handles=None):
    """Load ``manual_accounts.csv`` and resolve each entry via ``/api/v1/accounts/lookup``.

    The CSV file is expected next to this script; only the first column of
    each row is read and must have the form ``user@instance`` (a leading
    ``@`` is tolerated). Rows that are blank, start with ``#`` or contain no
    ``@`` are skipped, as are repeated handles.

    Args:
        seen_handles: Optional set of lower-cased handles already collected.
            It is only reported in the log and extended with the handles
            found here; entries already in it are still looked up, because
            manual accounts must be flagged with ``_manual``.

    Returns:
        List of account dicts tagged with ``_handle``, ``_source_instance``
        and ``_manual=True``. Empty when the CSV file does not exist.
    """
    csv_path = Path(__file__).parent / "manual_accounts.csv"
    if not csv_path.exists():
        log.info("manual_accounts.csv nenalezen, přeskakuji")
        return []
    if seen_handles is None:
        seen_handles = set()
    accounts = []
    looked_up = set()  # handles already requested in this run
    # utf-8-sig also accepts plain UTF-8 and drops a BOM if one is present.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        for row in csv.reader(f):
            if not row:
                continue
            entry = row[0].strip()
            if not entry or entry.startswith("#"):
                continue
            # Accept the common "@user@instance" notation.
            entry = entry.lstrip("@")
            if "@" not in entry:
                continue
            handle_part, instance = (part.strip() for part in entry.rsplit("@", 1))
            if not handle_part or not instance:
                continue
            handle = f"{handle_part}@{instance}"
            key = handle.lower()
            if key in looked_up:
                continue
            looked_up.add(key)
            log.info(f"  Zpracovávám manuální účet: {handle}, v seen_handles: {key in seen_handles}")
            url = f"https://{instance}/api/v1/accounts/lookup?acct={urllib.parse.quote(handle_part)}"
            token = _token_for(instance)
            acc = api_get(url, token=token)
            # Pause after every request, including failed ones.
            time.sleep(RATE_LIMIT_DELAY)
            if not acc or not isinstance(acc, dict):
                log.warning(f"  {handle}: lookup selhal")
                continue
            seen_handles.add(key)
            acc["_handle"] = handle
            acc["_source_instance"] = instance
            acc["_manual"] = True
            log.info(f"  MANUAL účet přidán: {handle}, _manual={acc.get('_manual')}, statuses={acc.get('statuses_count')}")
            accounts.append(acc)
            log.debug(f"  {handle}: OK ({acc.get('followers_count', 0)} followers)")
    log.info(f"Manuální účty: {len(accounts)} načteno z {csv_path.name}")
    return accounts

# ── FILTRY ────────────────────────────────────
def passes_quality(acc):
    """Return True when ``acc`` meets the quality thresholds for automatic inclusion.

    An account is rejected when it is suspended or limited, has fewer than
    ``MIN_STATUSES`` posts or ``MIN_FOLLOWERS`` followers, has no
    ``last_status_at`` value, or last posted more than ``MAX_DAYS_INACTIVE``
    days ago. A ``last_status_at`` value that cannot be parsed does not
    reject the account (best effort); it is reported at debug level.
    """
    if acc.get("suspended") or acc.get("limited"):
        return False
    if (acc.get("statuses_count") or 0) < MIN_STATUSES:
        return False
    if (acc.get("followers_count") or 0) < MIN_FOLLOWERS:
        return False
    last = acc.get("last_status_at")
    if not last:
        return False

    handle = acc.get("_handle", acc.get("acct", "?"))
    try:
        # fromisoformat() does not accept a trailing "Z" on every Python version.
        dt = datetime.fromisoformat(last.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError) as exc:
        # Only parse failures are tolerated; anything else should surface.
        log.debug(f"  nečitelné last_status_at u {handle}: {last!r} ({exc})")
        return True

    if dt.tzinfo is None:
        # Values without an offset (e.g. date-only) are treated as UTC.
        dt = dt.replace(tzinfo=timezone.utc)
    if dt < datetime.now(timezone.utc) - timedelta(days=MAX_DAYS_INACTIVE):
        log.debug(f"  vyhozen kvůli neaktivitě: {handle} last_active={last}")
        return False
    return True

# ── SCORING ───────────────────────────────────
def score(acc):
    """Compute a 0–100 score for an account dict.

    The score is the sum of four parts, capped at 100:
      - up to 40 points for followers (linear, saturating at 2000),
      - up to 30 points for posts (linear, saturating at 2000),
      - up to 20 points for the followers/following ratio (capped at 4),
      - 10 bonus points when the instance part of ``_handle`` contains
        ``"mamutovo"`` or ``"czech"``.

    Missing or falsy counts are treated as 0 (following: 1).
    """
    def _count(field, fallback):
        # Missing keys and falsy values (None, 0) both yield the fallback.
        return acc.get(field, fallback) or fallback

    followers = _count("followers_count", 0)
    statuses = _count("statuses_count", 0)
    following = _count("following_count", 1)

    reach = min(40, int(40 * min(followers, 2000) / 2000))
    activity = min(30, int(30 * min(statuses, 2000) / 2000))
    ratio = min(followers / max(following, 1), 4)
    balance = min(20, int(ratio * 5))

    # Everything after the last "@" is the instance; no "@" means no instance.
    _, at_sign, host = acc.get("_handle", "").rpartition("@")
    instance = host if at_sign else ""
    bonus = 0
    for marker in ("mamutovo", "czech"):
        if marker in instance:
            bonus = 10
            break

    return min(100, reach + activity + balance + bonus)

# ── KATEGORIE ─────────────────────────────────
# Category name -> keywords. categorize() lower-cases the text and returns the
# first category (in the order below) with a keyword that occurs as a substring.
# NOTE(review): because matching is by substring, short keywords such as "eu",
# "les", "git", "arch" or "kolo" also match inside unrelated words — confirm
# this is acceptable.
# NOTE(review): "zpravy" precedes "politika" and contains "politik", so any
# text containing "politika" is categorized as "zpravy" first.
CATEGORIES = {
    "tech":      ["linux", "python", "programov", "software", "opensource", "developer", "sysadmin", "git", "foss", "selfhosted", "homelab", "arch"],
    "foto":      ["fotografi", "foto", "photograph", "objektiv", "kamera"],
    "veda":      ["věda", "fyzika", "biologi", "astronom", "výzkum", "science", "matematik"],
    "kultura":   ["knihy", "literatura", "film", "hudba", "divadlo", "umění"],
    "gaming":    ["gaming", "hry", "videohry", "steam", "gamer"],
    "zpravy":    ["novinář", "zprávy", "politik", "média", "journalist", "zpravy", "news", "aktualne"],
    "sport":     ["sport", "fotbal", "hokej", "cycling", "running", "fitness", "tenis", "atletika", "cyklistika", "kolo", "beh", "plavani", "turistika"],
    "politika":  ["politika", "politics", "czech", "democracy", "volby", "eu"],
    "fediverse": ["fediverse", "mastodon", "activitypub", "mamutovo"],
    "cestovani": ["cestovani", "cestování", "travel", "dovolena"],
    "priroda":   ["příroda", "priroda", "les", "hory", "zahrada"],
    "jidlo":     ["jídlo", "jidlo", "vareni", "vaření", "recept", "food"],
}

def categorize(acc):
    """Return the category name for ``acc``, or ``"ostatni"`` when nothing matches.

    Featured tags (``acc["_featured_tags"]``) are checked first, one tag at a
    time; if none of them matches, the HTML-stripped bio (``note``) together
    with ``display_name`` is searched. Matching is a case-insensitive
    substring test against the keyword lists in ``CATEGORIES``; the first
    category in dict order wins.
    """
    def _first_category(haystack):
        # First category, in CATEGORIES order, with a keyword inside haystack.
        return next(
            (name for name, words in CATEGORIES.items()
             if any(word in haystack for word in words)),
            None,
        )

    # Primary source: featured tags.
    for tag in acc.get("_featured_tags", []):
        hit = _first_category(tag.lower())
        if hit is not None:
            return hit

    # Fallback: bio text plus display name.
    note_text = re.sub(r"<[^>]+>", " ", acc.get("note", "") or "")
    display = acc.get("display_name", "") or ""
    hit = _first_category(f"{note_text.lower()} {display.lower()}")
    return "ostatni" if hit is None else hit

def fetch_featured_tags(acc):
    """Return up to six featured tag names for ``acc``, caching them on the dict.

    The result is stored under ``acc["_featured_tags"]``; when that key is
    already present its value is returned without any request. Accounts
    lacking ``id`` or ``_source_instance``, and failed or non-list responses,
    yield an empty list (which is cached as well).
    """
    try:
        return acc["_featured_tags"]
    except KeyError:
        pass

    names = []
    account_id = acc.get("id")
    instance = acc.get("_source_instance", "")
    if account_id and instance:
        payload = api_get(
            f"https://{instance}/api/v1/accounts/{account_id}/featured_tags",
            token=_token_for(instance),
        )
        if payload and isinstance(payload, list):
            for item in payload:
                if isinstance(item, dict) and item.get("name"):
                    names.append(item["name"])
            names = names[:6]

    acc["_featured_tags"] = names
    return names

# ── VÝSTUP ────────────────────────────────────
def _to_output(acc):
    """Convert a raw account dict into the flat record written to the output files.

    The bio is the ``note`` field with HTML tags replaced by spaces, cut to
    220 characters. Calling this may trigger an API request through
    ``fetch_featured_tags``.
    """
    plain_bio = re.sub(r"<[^>]+>", " ", acc.get("note", "") or "").strip()
    rating = score(acc)
    # Tags must be resolved before categorize(): it reads
    # acc["_featured_tags"], which fetch_featured_tags() populates.
    tag_names = fetch_featured_tags(acc)
    category = categorize(acc)

    record = {}
    record["name"] = acc.get("display_name") or acc.get("username", "")
    record["handle"] = acc.get("_handle", acc.get("acct", ""))
    record["bio"] = plain_bio[:220]
    record["avatar"] = acc.get("avatar", "")
    record["followers"] = acc.get("followers_count", 0)
    record["statuses"] = acc.get("statuses_count", 0)
    record["score"] = rating
    record["tags"] = tag_names
    record["category"] = category
    record["last_active"] = acc.get("last_status_at", "")
    record["url"] = acc.get("url", "")
    return record

def build_output(raw):
    """Select and convert the accounts that go into the output files.

    Manual accounts (``_manual`` set) are always included, in input order and
    without the quality filter. The remaining slots up to ``TOP_N`` are
    filled with automatic accounts that pass ``passes_quality``, ordered by
    follower count (descending). Handles are deduplicated case-insensitively,
    manual entries taking precedence.

    Conversion via ``_to_output`` (which may issue an API request per
    account) is done only for accounts that end up in the result.

    NOTE(review): ranking uses the follower count, not the ``score`` field
    that is also written to the output — confirm this is intended.

    Args:
        raw: List of raw account dicts.

    Returns:
        List of output records: manual accounts first, then the top
        automatic ones.
    """
    def _handle_key(acc):
        # Same handle that _to_output() reports, lower-cased for comparison.
        return acc.get("_handle", acc.get("acct", "")).lower()

    seen = set()

    # Manual accounts are always included (regardless of TOP_N).
    manual = []
    for acc in raw:
        if not acc.get("_manual"):
            continue
        key = _handle_key(acc)
        if key in seen:
            continue
        seen.add(key)
        manual.append(_to_output(acc))

    # Automatic accounts fill the remaining slots up to TOP_N.
    candidates = []
    for acc in raw:
        if acc.get("_manual") or not passes_quality(acc):
            continue
        key = _handle_key(acc)
        if key in seen:
            continue
        seen.add(key)
        candidates.append(acc)

    # Sort and truncate the raw dicts first, so that no requests are spent
    # on accounts that are cut off anyway.
    candidates.sort(key=lambda a: a.get("followers_count") or 0, reverse=True)
    remaining = max(0, TOP_N - len(manual))
    return manual + [_to_output(acc) for acc in candidates[:remaining]]

def write_json(accounts, output_dir):
    """Write ``accounts.json`` into ``output_dir``.

    The file holds the generation timestamp (UTC, ISO 8601), the number of
    accounts and the account records themselves.

    Args:
        accounts: List of output records as produced by ``build_output``.
        output_dir: ``pathlib.Path`` of an existing directory.
    """
    data = {"generated_at": datetime.now(timezone.utc).isoformat(), "count": len(accounts), "accounts": accounts}
    p = output_dir / "accounts.json"
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be fixed instead of depending on the platform default.
    p.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    log.info(f"JSON: {p} ({len(accounts)} účtů)")

def write_csv(accounts, output_dir):
    """Write ``accounts.csv`` into ``output_dir``.

    The file has the header ``Account address,Show boosts`` followed by one
    row per account: its handle and the literal ``true``.
    """
    target = output_dir / "accounts.csv"
    with open(target, "w", newline="", encoding="utf-8") as fh:
        writer = csv.writer(fh)
        writer.writerow(["Account address", "Show boosts"])
        writer.writerows([entry["handle"], "true"] for entry in accounts)
    log.info(f"CSV:  {target}")

# ── MAIN ──────────────────────────────────────
def main():
    """Command-line entry point: collect, filter and write the account lists.

    Options: ``--output`` (target directory, created if missing), ``--top``
    (overrides the module-level ``TOP_N``) and ``--debug`` (DEBUG logging).

    Returns:
        0 on success, 1 when no account made it into the result.
    """
    global TOP_N
    cli = argparse.ArgumentParser()
    cli.add_argument("--output", default=".", help="Výstupní adresář")
    cli.add_argument("--top",    default=TOP_N, type=int)
    cli.add_argument("--debug",  action="store_true")
    opts = cli.parse_args()

    if opts.debug:
        logging.getLogger().setLevel(logging.DEBUG)
    # build_output() reads the module-level TOP_N, hence the global rebinding.
    TOP_N = opts.top

    target_dir = Path(opts.output)
    target_dir.mkdir(parents=True, exist_ok=True)

    log.info(f"Startuji – {len(QUERY_INSTANCES)} instancí")
    collected = fetch_all_accounts()
    known = set()
    for item in collected:
        known.add(item["_handle"].lower())
    collected.extend(load_manual_accounts(known))

    selected = build_output(collected)
    if not selected:
        log.error("Žádné účty! Zkontroluj připojení.")
        return 1

    log.info(f"Po filtraci: {len(selected)} účtů")
    write_json(selected, target_dir)
    write_csv(selected, target_dir)
    log.info("Hotovo.")
    return 0

if __name__ == "__main__":
    # exit() is a helper injected by the `site` module for interactive use and
    # is not guaranteed to exist (e.g. under `python -S`); raising SystemExit
    # passes main()'s return value on as the process exit status.
    raise SystemExit(main())