# amazon_unified_tool.py
# location: amazon_unified_tool.py
# purpose: Unified CLI for Amazon seller workflows (discover, dedupe, keywords, listing, feed). Builds listings from DataDive/Helium10 CSV and computes Ranking Juice.
# non-purpose: No SP‑API uploads, no network-required dependencies. Optional Groq enhancement uses HTTP only if a key is provided.

"""
This file provides a single, maintainable Python CLI that unifies key tasks
from your Amazon docs: discovery/dedup of local artifacts (DataDive/Helium10),
keyword consolidation with Ranking Juice-style scoring, listing draft generation
(title, bullets, backend terms), and a simple flat-file (CSV) export scaffold.
It is not a GUI panel and does not call HeyGen, OpenAI or SP-API. The only
network access is the optional Groq enhancement, used only when an API key is
provided.
"""

from __future__ import annotations

import argparse
import csv
import hashlib
import io
import json
import os
import re
import sys
import urllib.request
import urllib.error
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple

# ======================================================================================
# NOTES FOR MAINTAINERS (WHY this structure exists, not obvious syntax):
# - Single-file by design (AGENTS.md: keep codebase simple, avoid new files unless needed)
# - No external deps (network-restricted env; XLSX parsing not guaranteed) → CSV/JSON only
# - Provides safe defaults and dry-runs for filesystem actions (dedupe quarantine)
# - Listing generation follows key DataDive principles: keyword roots, anti-stuffing,
#   byte limits for backend terms, title length ≤ 200, 5 bullets with clear roles.
# - Heuristics are deterministic and documented so results are reproducible.
# ======================================================================================

# -------------------------------
# Small utility helpers
# -------------------------------

def _is_text_file(path: Path) -> bool:
    """Tell whether *path* looks like a text file, judging by its suffix only.

    The comparison is case-insensitive and the file is never opened, so large
    binaries cost nothing to classify.
    """
    text_suffixes = (".txt", ".md", ".csv", ".json", ".tsv", ".html", ".htm")
    extension = path.suffix.lower()
    return extension in text_suffixes


def _md5(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is consumed in pieces of at most ``chunk_size`` bytes so memory
    use stays bounded regardless of file size.
    """
    digest = hashlib.md5()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(chunk_size)
            if piece == b"":
                break
            digest.update(piece)
    return digest.hexdigest()


def _human_bytes(n: int) -> str:
    """Format a byte count with a binary unit (B, KiB, MiB or GiB).

    The value is divided by 1024 until it drops below 1024 or the GiB unit is
    reached, then rendered without decimals.
    """
    value = n
    for label in ("B", "KiB", "MiB"):
        if value < 1024:
            return f"{value:.0f} {label}"
        value = value / 1024.0
    # Anything that survived three divisions is reported in GiB, however large.
    return f"{value:.0f} GiB"


# -------------------------------
# Discovery & Dedupe (inspired by your prior reports)
# -------------------------------

# Regex fragments that mark a file as Helium 10 related. They are applied
# case-insensitively (see _match_any) to file names and, for text files, to
# file content (see discover).
HELIUM_PATTERNS = [
    r"helium\s*10",
    r"listing[ _-]*builder",
    r"scribbles",
    r"cerebro",
    r"magnet",
    r"xray",
]

# Regex fragments that mark a file as DataDive related; used the same way as
# HELIUM_PATTERNS.
DATADIVE_PATTERNS = [
    r"data\s*dive",
    r"datadive",
    r"ranking\s*juice",
]


def _iter_paths(roots: Sequence[Path], max_depth: int = 6) -> Iterable[Path]:
    """Yield every file found under *roots*, at most ``max_depth`` levels deep.

    Args:
        roots: Directories to scan. Roots that do not exist are skipped; each
            existing root is resolved before walking.
        max_depth: Maximum directory depth relative to a root (the root itself
            is depth 0). A negative value yields nothing.

    Yields:
        Paths of files (not directories) within the depth bound.
    """
    for root in roots:
        if not root.exists():
            continue
        root = root.resolve()
        base_depth = len(root.parts)
        for dirpath, dirnames, filenames in os.walk(root):
            current = Path(dirpath)
            depth = len(current.parts) - base_depth
            if depth >= max_depth:
                # Prune in place so os.walk never descends below the bound;
                # merely skipping the yield would still traverse the whole tree.
                dirnames[:] = []
            if depth > max_depth:
                continue
            for name in filenames:
                yield current / name


def _match_any(patterns: Sequence[str], text: str) -> bool:
    """Return True if at least one regex in *patterns* matches anywhere in *text*.

    Matching ignores case; an empty pattern list never matches.
    """
    for pattern in patterns:
        if re.search(pattern, text, re.IGNORECASE) is not None:
            return True
    return False


@dataclass
class FileMeta:
    """Metadata for one discovered file, as collected by ``discover``."""

    path: Path  # location of the file on disk
    size: int  # st_size from Path.stat()
    mtime_iso: str  # st_mtime rendered with datetime.fromtimestamp(...).isoformat()
    md5: Optional[str] = None  # hex digest set by compute_hashes; None until hashed or when hashing failed


def discover(roots: Sequence[Path], max_depth: int = 6) -> List[FileMeta]:
    """Discover files related to DataDive/Helium10/Listing Builder by name or content.

    A file matches when its name hits one of the tool patterns (or the generic
    words "amazon", "seller", "listing"). Failing that, text files match when
    their content hits one of the tool patterns. The cheap name check runs
    first, so content is read only for text files whose name did not match.

    Each path is reported at most once, even when *roots* overlap. Reporting a
    path twice would make it look like a duplicate of itself to the dedupe
    step.

    Args:
        roots: Directories to scan.
        max_depth: Depth bound forwarded to ``_iter_paths``.

    Returns:
        One ``FileMeta`` per matching file (``md5`` left unset). Files that
        cannot be read or stat'ed are skipped silently.
    """
    name_patterns = HELIUM_PATTERNS + DATADIVE_PATTERNS + [r"amazon", r"seller", r"listing"]
    content_patterns = HELIUM_PATTERNS + DATADIVE_PATTERNS

    matches: List[FileMeta] = []
    visited: set[Path] = set()
    for p in _iter_paths(roots, max_depth=max_depth):
        # Overlapping roots yield the same path more than once; handle it once.
        if p in visited:
            continue
        visited.add(p)

        if not _match_any(name_patterns, p.name):
            # Fall back to a content scan, but only for text-like suffixes.
            if not _is_text_file(p):
                continue
            try:
                raw = p.read_text(errors="ignore")
            except Exception:
                # Best effort: unreadable files are not candidates.
                continue
            if not _match_any(content_patterns, raw):
                continue

        try:
            stat = p.stat()
        except OSError:
            # File vanished or is not accessible; skip it.
            continue
        matches.append(
            FileMeta(
                path=p,
                size=stat.st_size,
                mtime_iso=datetime.fromtimestamp(stat.st_mtime).isoformat(),
            )
        )
    return matches


def compute_hashes(items: List[FileMeta]) -> None:
    """Populate ``md5`` on every item in place.

    Hashing is best effort: if a file cannot be hashed for any reason, its
    ``md5`` is set to ``None`` and processing continues with the next item.
    """
    for entry in items:
        digest: Optional[str]
        try:
            digest = _md5(entry.path)
        except Exception:
            digest = None
        entry.md5 = digest


def group_duplicates(items: List[FileMeta]) -> Dict[str, List[FileMeta]]:
    """Bucket items by MD5 digest and return only buckets with 2+ members.

    Items without a digest (``None`` or empty) are ignored. Bucket order and
    member order follow the order of *items*.
    """
    buckets: Dict[str, List[FileMeta]] = {}
    for meta in items:
        if meta.md5:
            if meta.md5 not in buckets:
                buckets[meta.md5] = []
            buckets[meta.md5].append(meta)

    duplicates: Dict[str, List[FileMeta]] = {}
    for digest, members in buckets.items():
        if len(members) >= 2:
            duplicates[digest] = members
    return duplicates


def quarantine_duplicates(groups: Dict[str, List[FileMeta]], move_to: Path, dry_run: bool = True) -> List[Tuple[Path, Path]]:
    """Move all but the newest (by mtime) file in each dupe group into a quarantine folder.

    Args:
        groups: Mapping of content hash to the files sharing that hash.
        move_to: Quarantine directory. Files are flattened into it by name;
            name clashes get a " (k)" suffix.
        dry_run: When True (default) nothing on disk is touched. The
            quarantine directory is not created either; only the plan is
            computed.

    Returns:
        List of (src, dst) planned/performed moves. Destinations are unique
        within the plan, so a dry-run shows the moves a real run would make.
    """
    plan: List[Tuple[Path, Path]] = []
    # Destinations already promised to an earlier move. Needed because in a
    # dry run (or after a failed move) the destination does not exist on disk.
    reserved: set = set()
    if not dry_run:
        move_to.mkdir(parents=True, exist_ok=True)

    for lst in groups.values():
        if not lst:
            continue
        # Sort by mtime desc; keep first. ISO timestamps sort chronologically.
        try:
            ordered = sorted(lst, key=lambda m: m.mtime_iso, reverse=True)
        except (TypeError, AttributeError):
            ordered = list(lst)
        keep = ordered[0]
        # Never move the kept file, and never move the same source twice, even
        # if the group lists a path more than once.
        handled = {keep.path}
        for m in ordered[1:]:
            if m.path in handled:
                continue
            handled.add(m.path)

            dst = move_to / m.path.name
            if dst.exists() or dst in reserved:
                stem, suf = dst.stem, dst.suffix
                k = 1
                while True:
                    alt = move_to / f"{stem} ({k}){suf}"
                    if not alt.exists() and alt not in reserved:
                        dst = alt
                        break
                    k += 1
            reserved.add(dst)
            plan.append((m.path, dst))
            if not dry_run:
                try:
                    m.path.replace(dst)
                except OSError as e:
                    print(f"WARN: failed to move {m.path} → {dst}: {e}", file=sys.stderr)
    return plan


# -------------------------------
# Keyword ingestion and listing generation
# -------------------------------

@dataclass
class KeywordRow:
    """One keyword phrase together with the metrics parsed from a CSV row."""

    keyword: str  # phrase text as read from the CSV (whitespace-stripped by the loaders)
    search_volume: int = 0  # parsed search volume; 0 when missing or unparsable
    relevancy: float = 0.0  # raw relevancy / IQ value; normalized to 0..1 in KeywordDb.compute_ranking_juice
    competition: float = 0.0  # raw competition value; normalized the same way
    source: str = ""  # origin label taken from the CSV "source"/"Source" column when present
    extra: Dict[str, str] = field(default_factory=dict)  # CSV columns not mapped to a dedicated field
    ranking_juice: float = 0.0  # Ranking Juice from the CSV if any; compute_ranking_juice keeps the max of it and the heuristic score


@dataclass
class KeywordDb:
    """In-memory collection of ``KeywordRow`` objects with sorting, dedupe and scoring helpers."""

    keywords: List[KeywordRow] = field(default_factory=list)

    def by_volume(self) -> List[KeywordRow]:
        """Return rows ordered by search volume, then relevancy, highest first."""
        def volume_key(row):
            return (row.search_volume, row.relevancy)

        return sorted(self.keywords, key=volume_key, reverse=True)

    def by_rj(self) -> List[KeywordRow]:
        """Return rows ordered by Ranking Juice, then search volume, highest first."""
        def rj_key(row):
            return (row.ranking_juice, row.search_volume)

        return sorted(self.keywords, key=rj_key, reverse=True)

    def unique(self) -> "KeywordDb":
        """Return a new db keeping the first row per keyword (trimmed, case-insensitive).

        Rows whose keyword is blank after trimming are dropped.
        """
        first_by_key = {}
        for row in self.keywords:
            normalized = row.keyword.strip().lower()
            if normalized and normalized not in first_by_key:
                first_by_key[normalized] = row
        return KeywordDb(list(first_by_key.values()))

    def compute_ranking_juice(self) -> None:
        """Assign a DataDive-inspired Ranking Juice score to every row, in place.

        Score = SV × rel_factor × comp_factor × length_boost, where
        - SV is the search volume (negative or missing values count as 0);
        - relevancy and competition are first normalized to 0..1: values ≤ 1
          are clamped, values ≤ 10 are divided by 10, values ≤ 100 by 100, and
          anything larger becomes 1. A missing/zero relevancy counts as 0.5, a
          missing competition as 0;
        - rel_factor = 0.5 + 0.5 × relevancy and
          comp_factor = 0.5 + 0.5 × (1 − competition), so each lies in 0.5..1;
        - length_boost is 1.1 for 2–3 word phrases, 1.0 for single words and
          1.05 otherwise.
        A Ranking Juice value already on the row wins when it is higher.
        """
        def norm01(val: float) -> float:
            if val <= 1:
                return max(0.0, min(1.0, val))
            # Heuristic scaling for values given on a 0..10 or 0..100 scale.
            if val <= 10:
                return val / 10.0
            if val <= 100:
                return val / 100.0
            return 1.0

        for row in self.keywords:
            volume = max(0, row.search_volume or 0)
            relevancy = norm01(row.relevancy or 0.5)
            competition = norm01(row.competition or 0.0)

            word_count = len(_tokenize(row.keyword))
            if word_count in (2, 3):
                boost = 1.1
            elif word_count == 1:
                boost = 1.0
            else:
                boost = 1.05

            rel_factor = 0.5 + 0.5 * relevancy
            comp_factor = 0.5 + 0.5 * (1.0 - competition)
            estimate = float(volume) * rel_factor * comp_factor * boost
            # Respect a CSV-provided Ranking Juice when it is higher.
            row.ranking_juice = max(row.ranking_juice or 0.0, estimate)


def _filter_keyword_db(db: 'KeywordDb', include: Optional[str], exclude: Optional[str]) -> 'KeywordDb':
    """Filter keywords with optional case-insensitive include/exclude regexes.

    With neither pattern given, the very same *db* object is returned. Otherwise
    a new, de-duplicated db is built from rows whose keyword matches *include*
    (if set) and does not match *exclude* (if set), and its Ranking Juice is
    recomputed.
    """
    if not (include or exclude):
        return db

    include_re = re.compile(include, re.IGNORECASE) if include else None
    exclude_re = re.compile(exclude, re.IGNORECASE) if exclude else None

    def passes(row) -> bool:
        text = row.keyword or ""
        if include_re is not None and include_re.search(text) is None:
            return False
        if exclude_re is not None and exclude_re.search(text) is not None:
            return False
        return True

    filtered = KeywordDb([row for row in db.keywords if passes(row)]).unique()
    filtered.compute_ranking_juice()
    return filtered


def _sniff_csv(fp: io.TextIOBase) -> csv.Dialect:
    """Guess the CSV dialect from the first 4096 characters of *fp*.

    The stream is rewound to the start afterwards. When sniffing fails, a plain
    comma-separated dialect with double-quote quoting is returned instead.
    """
    sample = fp.read(4096)
    fp.seek(0)
    try:
        detected = csv.Sniffer().sniff(sample)
    except Exception:
        # Sniffer could not decide; fall back to comma-separated values.
        class _D(csv.Dialect):
            delimiter = ","
            quotechar = '"'
            doublequote = True
            skipinitialspace = True
            lineterminator = "\n"
            quoting = csv.QUOTE_MINIMAL

        detected = _D()
    return detected


# Header aliases accepted by the CSV loaders, in lookup priority order.
_CSV_KEYWORD_COLS = ("keyword", "Keyword", "term", "Term", "phrase", "Phrase", "Search Terms", "Search Term")
_CSV_VOLUME_COLS = ("search_volume", "Search Volume", "Exact Search Volume", "search_volume_exact", "SV")
_CSV_RELEVANCY_COLS = ("relevancy", "Relevancy", "iq_score", "Relev.", "Rel.", "Rel")
_CSV_COMPETITION_COLS = ("competition", "Competition")
_CSV_SOURCE_COLS = ("source", "Source")
_CSV_RJ_COLS = ("Ranking Juice", "Ranking Juice ®", "Total Ranking Juice ®", "RJ")
_CSV_KNOWN_COLS = frozenset(
    _CSV_KEYWORD_COLS
    + _CSV_VOLUME_COLS
    + _CSV_RELEVANCY_COLS
    + _CSV_COMPETITION_COLS
    + _CSV_SOURCE_COLS
    + _CSV_RJ_COLS
)


def _csv_first(row: Dict[str, str], columns: Sequence[str]) -> str:
    """Return the first non-empty value among *columns* in *row*, or ""."""
    for column in columns:
        value = row.get(column)
        if value:
            return value
    return ""


def _csv_int(raw: Optional[str]) -> int:
    """Parse an integer from a CSV cell such as "1,234" or "1234.0"; 0 on failure."""
    cleaned = re.sub(r"[^0-9.]", "", raw or "")
    if cleaned.count(".") > 1:
        # Several dots cannot be a decimal point: treat them as group separators.
        cleaned = cleaned.replace(".", "")
    try:
        # Go through float so "1234.0" becomes 1234 rather than 12340.
        return int(float(cleaned))
    except (ValueError, OverflowError):
        return 0


def _csv_float(raw: Optional[str]) -> float:
    """Parse a float from a CSV cell such as "85%" or "1,234.5"; 0.0 on failure."""
    cleaned = re.sub(r"[^0-9.]+", "", raw or "")
    if not cleaned:
        return 0.0
    try:
        return float(cleaned)
    except ValueError:
        return 0.0


def _csv_row_to_keyword(row: Dict[str, str], default_source: str = "") -> Optional[KeywordRow]:
    """Convert one DictReader row into a KeywordRow; None when it has no keyword."""
    keyword = _csv_first(row, _CSV_KEYWORD_COLS).strip()
    if not keyword:
        return None
    # DictReader stores overflow cells under the key None; keep named columns only.
    extra = {k: v for k, v in row.items() if k is not None and k not in _CSV_KNOWN_COLS}
    return KeywordRow(
        keyword=keyword,
        search_volume=_csv_int(_csv_first(row, _CSV_VOLUME_COLS)),
        relevancy=_csv_float(_csv_first(row, _CSV_RELEVANCY_COLS)),
        competition=_csv_float(_csv_first(row, _CSV_COMPETITION_COLS)),
        source=(_csv_first(row, _CSV_SOURCE_COLS) or default_source).strip(),
        extra=extra,
        ranking_juice=_csv_float(_csv_first(row, _CSV_RJ_COLS)),
    )


def _csv_row_mentions(row: Dict[str, str], asin_u: str) -> bool:
    """Tell whether any cell of *row* equals *asin_u* (trimmed, case-insensitive)."""
    return any(isinstance(v, str) and v.strip().upper() == asin_u for v in row.values())


def load_keywords_from_csv(path: Path) -> KeywordDb:
    """Load keywords from a flexible CSV export (DataDive/Helium10).

    Recognized headers (any subset, exact spelling):
    - keyword: keyword, Keyword, term, Term, phrase, Phrase, Search Terms, Search Term
    - volume: search_volume, Search Volume, Exact Search Volume, search_volume_exact, SV
    - relevancy: relevancy, Relevancy, iq_score, Relev., Rel., Rel
    - competition: competition, Competition
    - source: source, Source
    - Ranking Juice: Ranking Juice, Ranking Juice ®, Total Ranking Juice ®, RJ
    Every other named column is kept in ``KeywordRow.extra``.

    The file is decoded as UTF-8 with an optional BOM, because a BOM would
    otherwise become part of the first header name and hide that column.
    Rows without a keyword are skipped.

    Returns:
        A de-duplicated KeywordDb with Ranking Juice computed.
    """
    rows: List[KeywordRow] = []
    with path.open("r", encoding="utf-8-sig", errors="ignore", newline="") as f:
        reader = csv.DictReader(f, dialect=_sniff_csv(f))
        for record in reader:
            row = _csv_row_to_keyword(record)
            if row is not None:
                rows.append(row)
    db = KeywordDb(rows).unique()
    db.compute_ranking_juice()
    return db


def load_keywords_from_csv_with_asin(path: Path, asin: str) -> KeywordDb:
    """Load keywords but keep only rows that reference the given ASIN when present in CSV.

    If any column name contains "asin", a row is kept only when one of its
    cells (in any column) equals the ASIN, compared trimmed and
    case-insensitively. Without such a column no filtering happens. An empty
    *asin* delegates to ``load_keywords_from_csv``.

    Rows are parsed exactly as in ``load_keywords_from_csv``, except that a
    missing source defaults to the file name.
    """
    asin_u = (asin or "").strip().upper()
    if not asin_u:
        return load_keywords_from_csv(path)

    rows: List[KeywordRow] = []
    with path.open("r", encoding="utf-8-sig", errors="ignore", newline="") as f:
        reader = csv.DictReader(f, dialect=_sniff_csv(f))
        # The header is the same for every row, so detect ASIN columns once.
        has_asin_cols = any(c and "asin" in c.lower() for c in reader.fieldnames or [])
        for record in reader:
            if has_asin_cols and not _csv_row_mentions(record, asin_u):
                continue
            row = _csv_row_to_keyword(record, default_source=path.name)
            if row is not None:
                rows.append(row)
    db = KeywordDb(rows).unique()
    db.compute_ranking_juice()
    return db


# -------------- Listing generation (heuristic; no LLM) --------------

WORD_SPLIT_RE = re.compile(r"[^a-z0-9]+", re.IGNORECASE)


def _tokenize(text: str) -> List[str]:
    """Lower-case *text* and split it into ASCII alphanumeric tokens.

    Every run of characters other than a-z and 0-9 acts as a separator; empty
    pieces are discarded.
    """
    pieces = WORD_SPLIT_RE.split(text.lower())
    return list(filter(None, pieces))


def _truncate_bytes(s: str, byte_limit: int) -> str:
    """Shorten *s* so that its UTF-8 encoding fits into ``byte_limit`` bytes.

    The string is returned unchanged when it already fits. Otherwise the
    encoded form is cut at the limit and any multi-byte character split by the
    cut is dropped, so the result is always valid text. A negative limit is
    treated as 0.
    """
    encoded = s.encode("utf-8")
    if len(encoded) <= byte_limit:
        return s
    # A single slice replaces byte-by-byte trimming. errors="ignore" discards
    # the incomplete trailing character, if any.
    return encoded[:max(0, byte_limit)].decode("utf-8", errors="ignore")


@dataclass
class ProductInfo:
    """Product facts used when composing a listing (see generate_title)."""

    brand: str  # placed first in the generated title
    product_line: str  # follows the brand in the generated title
    category: str  # forwarded to the optional Groq prompt
    attributes: List[str] = field(default_factory=list)  # e.g., ["12oz", "Whole Bean", "Insulated"]
    flavor_or_variant: Optional[str] = None  # appended to the title as its own segment when set


@dataclass
class ListingDraft:
    """Generated listing content: title, bullets, backend search terms and optional extras."""

    title: str
    bullets: List[str]
    backend_keywords: str  # backend search terms as a single string
    description: str = ""
    coverage: float = 0.0  # presumably the 0-100 value from calculate_coverage — TODO confirm against the caller


# -------------------------------
# Coverage calculation (tokenized)
# -------------------------------

def calculate_coverage(keywords: List[KeywordRow], title: str, bullets: List[str], backend: str, description: str = "") -> float:
    """Return the tokenized keyword coverage of a listing as a percentage (0-100).

    Only the first 200 entries of *keywords* are considered, so callers should
    pass them sorted by importance (e.g. by Ranking Juice). A keyword counts as
    covered when at least ONE of its words appears as a word anywhere in the
    title, bullets, backend keywords or description. Words are lower-cased
    runs of a-z/0-9. This is deliberately more lenient than exact phrase
    matching.

    Args:
        keywords: Keyword rows, most important first.
        title: Listing title.
        bullets: Bullet point texts.
        backend: Backend keyword string.
        description: Optional description text.

    Returns:
        100 × covered / considered, or 0.0 when *keywords* is empty.

    Example:
        Keyword "cutting board set" is covered by the title
        "Bamboo Cutting Board for Kitchen" because "cutting" (and "board")
        occur in it.
    """
    word_re = re.compile(r'[a-z0-9]+')

    considered = keywords[:200]
    if not considered:
        return 0.0

    listing_text = f"{title} {' '.join(bullets)} {backend} {description}".lower()
    listing_words = set(word_re.findall(listing_text))

    hits = sum(
        1
        for row in considered
        if not listing_words.isdisjoint(word_re.findall(row.keyword.lower()))
    )
    return (hits / len(considered)) * 100.0


# -------------------------------
# Optional: Groq AI enhancement
# -------------------------------

def _groq_chat_completion(messages: List[Dict[str, str]], model: str, api_key: str, temperature: float = 0.3, timeout: int = 30) -> Optional[str]:
    """POST a chat request to Groq's OpenAI-compatible endpoint and return the reply text.

    Uses urllib so no third-party HTTP client is needed. The request asks for a
    JSON-object response and authenticates with *api_key* as a bearer token.

    Returns:
        The ``content`` of the first choice's message, or None when the request
        fails or the response has an unexpected shape. Failures are swallowed
        on purpose because the enhancement is optional.
    """
    endpoint = "https://api.groq.com/openai/v1/chat/completions"
    request_body = json.dumps(
        {
            "model": model,
            "messages": messages,
            "temperature": temperature,
            "response_format": {"type": "json_object"},
        }
    ).encode("utf-8")
    request = urllib.request.Request(
        endpoint,
        data=request_body,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            raw = response.read()
        parsed = json.loads(raw.decode("utf-8", errors="ignore"))
        # OpenAI-like shape: choices[0].message.content
        first_choice = parsed.get("choices", [{}])[0]
        return first_choice.get("message", {}).get("content")
    except Exception:
        return None


def _extract_json_block(text: str) -> Optional[dict]:
    """Extract a JSON object from text that may carry extra prose or code fences.

    The whole text is tried first. If that does not yield a JSON object, the
    widest ``{...}`` span in the text is tried.

    Returns:
        The parsed object as a dict, or None when no JSON *object* can be
        parsed. Valid JSON of another type (number, string, list) also yields
        None, so callers can rely on the dict contract.
    """
    if not text:
        return None

    candidates = [text]
    if isinstance(text, str):
        # Widest brace-delimited span, e.g. the payload inside a ```json fence.
        m = re.search(r"\{[\s\S]*\}", text)
        if m and m.group(0) != text:
            candidates.append(m.group(0))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (ValueError, TypeError, RecursionError):
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def enhance_listing_with_groq(prod: 'ProductInfo', kwdb: 'KeywordDb', draft: ListingDraft, api_key: str, model: str = "llama-3.1-70b-versatile") -> ListingDraft:
    """Use Groq to refine title/bullets/backend within strict Amazon rules.

    The 120 highest-Ranking-Juice keywords, the product facts and the current
    draft are sent to the model, which must answer with JSON holding ``title``,
    five ``bullets`` and ``backend_keywords``.

    Falls back to *draft* (returned unchanged) when the call fails or the
    answer is unusable. Unusable means: not a JSON object, a missing or
    non-string title, anything but a list of exactly five bullets, or backend
    keywords that are neither a string nor a list of terms.

    Guardrails applied to an accepted answer:
    - title trimmed and cut to 200 characters;
    - each bullet trimmed and cut to 500 characters;
    - backend keywords limited to 250 UTF-8 bytes, dropping whole
      comma-separated terms first and, as a last resort, cutting at a word
      boundary.

    The draft's description is carried over; coverage is left at its default
    for the caller to recompute.

    NOTE(review): the default model id is a hosted-model name; confirm it is
    still served by Groq before relying on the default.
    """
    # Prepare top keywords by RJ
    top_keywords = [
        {
            "keyword": r.keyword,
            "volume": r.search_volume,
            "relevancy": r.relevancy,
            "competition": r.competition,
            "rj": round(r.ranking_juice, 2),
        }
        for r in kwdb.by_rj()[:120]
    ]
    system = (
        "You are an expert Amazon SEO copywriter. Follow these hard constraints: "
        "Title ≤ 200 chars; 5 bullets ≤ 500 chars each; backend keywords ≤ 250 bytes; "
        "no brand names in backend; avoid keyword stuffing (≤2x word in title, ≤3x in listing)."
    )
    user = {
        "product": {
            "brand": prod.brand,
            "product_line": prod.product_line,
            "category": prod.category,
            "attributes": prod.attributes,
            "variant": prod.flavor_or_variant,
        },
        "keywords": top_keywords,
        "current_draft": {
            "title": draft.title,
            "bullets": draft.bullets,
            "backend_keywords": draft.backend_keywords,
        },
        "instructions": (
            "Improve the listing using the highest-RJ keyword phrases naturally. "
            "Return strict JSON: {title: string, bullets: string[5], backend_keywords: string}. "
            "Keep language concise and benefit-focused."
        ),
    }
    content = _groq_chat_completion(
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": json.dumps(user, ensure_ascii=False)},
        ],
        model=model,
        api_key=api_key,
        temperature=0.3,
    )
    enhanced = _extract_json_block(content) if content else None
    if not isinstance(enhanced, dict):
        return draft

    # Validate shape. Model output is untrusted, so check types before calling
    # string methods on it.
    title = enhanced.get("title")
    bullets = enhanced.get("bullets")
    backend = enhanced.get("backend_keywords")
    if isinstance(backend, (list, tuple)):
        # Models sometimes answer with a list of terms instead of one string.
        backend = " ".join(str(term).strip() for term in backend if term)
    if (
        not isinstance(title, str)
        or not title.strip()
        or not isinstance(bullets, list)
        or len(bullets) != 5
        or not isinstance(backend, str)
    ):
        return draft

    # Final guardrails
    title = title.strip()[:200]
    bullets = [str(b or "").strip()[:500] for b in bullets]

    # Enforce backend byte limit: drop whole comma-separated terms first.
    bk = backend.strip()
    while len(bk.encode("utf-8")) > 250 and "," in bk:
        bk = ",".join(bk.split(",")[:-1])
    if len(bk.encode("utf-8")) > 250:
        cut = _truncate_bytes(bk, 250)
        # If the cut landed inside a word, drop that partial word.
        if len(cut) < len(bk) and not bk[len(cut)].isspace() and " " in cut:
            cut = cut.rsplit(" ", 1)[0]
        bk = cut.rstrip()
    return ListingDraft(title=title, bullets=bullets, backend_keywords=bk, description=draft.description)


def validate_listing(title: str, bullets: List[str], backend_keywords: str, aggressive: bool = False) -> Dict[str, object]:
    """Return basic compliance metrics aligned with Amazon/DataDive heuristics.

    Args:
        title: Listing title
        bullets: List of bullet points
        backend_keywords: Backend keywords string
        aggressive: If True, use relaxed limits (≤3× title, ≤5× listing, ≤8%
            density) instead of the standard ones (≤2×, ≤3×, ≤6%)

    Returns:
        Dict with validation metrics:
        - title_len, title_ok (≤200 chars)
        - bullets_count, bullets_ok (exactly 5 non-empty strings)
        - backend_bytes, backend_ok (≤250 UTF-8 bytes)
        - title_word_repetition: {word: count} for words over the title limit
        - listing_word_repetition: {word: count} for words over the listing
          limit (title + bullets)
        - keyword_density: fraction 0..1 of meaningful tokens that repeat an
          earlier token, i.e. 1 - unique/total
        - density_ok: keyword_density within the limit of the active mode
        - mode: "aggressive" or "standard"
        - limits: human-readable limits actually applied
        Only words longer than 3 characters count as meaningful.
    """
    from collections import Counter

    # Set limits based on mode. density_ok and the reported "limits" both read
    # these values, so the check and the report cannot disagree.
    max_title_repetition = 3 if aggressive else 2
    max_listing_repetition = 5 if aggressive else 3
    max_density = 0.08 if aggressive else 0.06

    # Title length
    title_len = len(title)
    title_ok = title_len <= 200
    # Bullets
    bullets_count = len(bullets)
    bullets_ok = bullets_count == 5 and all(isinstance(b, str) and b for b in bullets)
    # Backend byte limit
    backend_bytes = len((backend_keywords or "").encode("utf-8"))
    backend_ok = backend_bytes <= 250

    # Anti-stuffing checks (only meaningful words >3 chars). Non-string bullets
    # are already reported through bullets_ok, so skip them here instead of
    # crashing the join.
    bullet_text = " ".join(b for b in bullets if isinstance(b, str))
    t_tokens = [w for w in _tokenize(title) if len(w) > 3]
    l_tokens = [w for w in _tokenize(title + " " + bullet_text) if len(w) > 3]
    t_freq = Counter(t_tokens)
    l_freq = Counter(l_tokens)
    title_viol = {w: c for w, c in t_freq.items() if c > max_title_repetition}
    listing_viol = {w: c for w, c in l_freq.items() if c > max_listing_repetition}

    # Keyword density: share of meaningful tokens that are repeats
    total_words = len(l_tokens)
    unique_words = len(set(l_tokens))
    keyword_density = (1.0 - unique_words / total_words) if total_words > 0 else 0.0

    return {
        "title_len": title_len,
        "title_ok": title_ok,
        "bullets_count": bullets_count,
        "bullets_ok": bullets_ok,
        "backend_bytes": backend_bytes,
        "backend_ok": backend_ok,
        "title_word_repetition": title_viol,
        "listing_word_repetition": listing_viol,
        "keyword_density": keyword_density,
        "density_ok": keyword_density <= max_density,
        "mode": "aggressive" if aggressive else "standard",
        "limits": {
            "title_repetition": f"≤{max_title_repetition}×",
            "listing_repetition": f"≤{max_listing_repetition}×",
            "density": f"≤{max_density:.0%}",
        },
    }


def _choose_roots_by_rj(kws: KeywordDb, limit: int = 6) -> List[str]:
    """Select up to *limit* distinct root phrases, best Ranking Juice first.

    A root is the first three tokens (at most) of a keyword with two or more
    tokens; single-word keywords are ignored. A candidate is rejected when it
    is contained in an already chosen root, or contains one, as a substring.
    """
    selected: List[str] = []
    for row in kws.by_rj():
        tokens = _tokenize(row.keyword)
        if len(tokens) >= 2:
            candidate = " ".join(tokens[:3])
            overlaps = False
            for existing in selected:
                if candidate in existing or existing in candidate:
                    overlaps = True
                    break
            if not overlaps:
                selected.append(candidate)
                if len(selected) >= limit:
                    break
    return selected


def _dedupe_roots_tokens(roots: List[str], brand: str, product_line: str, max_repeat: int = 2) -> List[str]:
    """Greedily keep roots so that no token exceeds ``max_repeat`` occurrences.

    Occurrences are counted across the brand, the product line and every root
    kept so far. Roots are examined in order; a root is dropped when adding it
    would push any of its tokens over the limit. Repeats of a token inside the
    same root count as well (e.g. "bag in bag" uses "bag" twice).

    Args:
        roots: Candidate root phrases, best first.
        brand: Brand name whose tokens are pre-counted.
        product_line: Product line whose tokens are pre-counted.
        max_repeat: Maximum total occurrences allowed per token.

    Returns:
        The kept roots, in their original order.
    """
    from collections import Counter

    freq = Counter(_tokenize(brand + " " + product_line))
    kept: List[str] = []
    for r in roots:
        root_counts = Counter(_tokenize(r))
        # Check the root as a whole, so a token repeated inside one root cannot
        # slip past the limit one occurrence at a time.
        if all(freq[t] + c <= max_repeat for t, c in root_counts.items()):
            kept.append(r)
            freq.update(root_counts)
    return kept


def _limit_repetitions(text: str, max_per_word: int = 2) -> str:
    """Drop occurrences of a word beyond its first ``max_per_word`` ones.

    The text is tokenized with ``_tokenize``. The result is therefore the
    surviving lower-cased ASCII tokens joined by single spaces, and the
    original punctuation and casing are not preserved.
    """
    seen_counts: Dict[str, int] = {}
    output: List[str] = []
    for token in _tokenize(text):
        used = seen_counts.get(token, 0)
        if used >= max_per_word:
            continue
        seen_counts[token] = used + 1
        output.append(token)
    return " ".join(output)


def generate_title(prod: ProductInfo, kws: KeywordDb, max_len: int = 200, aggressive: bool = False) -> str:
    """Compose an SEO-aware title from product facts and top keywords.

    In aggressive mode the work is delegated to ``_generate_title_max_rj``.
    Otherwise the title is built from these segments, joined with " - ":
    1. "Brand Product Line"
    2. up to 4 keyword roots chosen by Ranking Juice, joined with " • ". No
       token may occur more than twice across brand, product line and roots.
    3. up to 4 product attributes that add at least one token not already in
       the roots, joined with " | "
    4. the flavor/variant, when set
    5. a generic benefit phrase
    The result is cut to ``max_len`` characters and right-stripped.

    Args:
        prod: Product information
        kws: Keyword database (rows are ranked by Ranking Juice internally)
        max_len: Maximum title length (default 200)
        aggressive: If True, use the aggressive packing mode instead
    """
    if aggressive:
        return _generate_title_max_rj(prod, kws, max_len)

    # Roots: best by Ranking Juice, then anti-stuffing against brand/product tokens.
    candidate_roots = _choose_roots_by_rj(kws, limit=6)
    roots = _dedupe_roots_tokens(candidate_roots, prod.brand, prod.product_line, max_repeat=2)[:4]

    # Attributes are only worth title space when they bring a new token.
    covered_tokens = {tok for r in roots for tok in _tokenize(r)}
    attrs = [
        a
        for a in prod.attributes[:4]
        if not set(_tokenize(a)).issubset(covered_tokens)
    ]

    segments: List[str] = [f"{prod.brand} {prod.product_line}".strip()]
    if roots:
        segments.append(" • ".join(roots))
    if attrs:
        segments.append(" | ".join(attrs))
    if prod.flavor_or_variant:
        segments.append(prod.flavor_or_variant)
    # Unique benefit placeholder (generic, safe across categories)
    segments.append("Durable & Easy To Clean")

    title = " - ".join(s for s in segments if s)
    if len(title) > max_len:
        return title[:max_len].rstrip()
    return title


def _generate_title_max_rj(prod: ProductInfo, kws: KeywordDb, max_len: int = 200) -> str:
    """AGGRESSIVE TITLE: Maximum Ranking Juice with EXACT phrases (dash-separated).

    Strategy:
    - Brand + Product Line
    - Add top 7-9 keywords as "phrase1 - phrase2 - phrase3"
    - NO connectors (with/for) - only dashes to preserve EXACT matches
    - Fill up to 197 chars (3 char safety margin)
    - Track word repetition (≤3× per meaningful word, relaxed from ≤2×)

    This aggressive mode maximizes Ranking Juice by:
    1. Packing more EXACT phrases (7-9 vs 3-4)
    2. Using dash separators only (no "with"/"for" that break EXACT matches)
    3. Relaxed repetition limit (3× vs 2×) to allow more high-RJ keywords

    Example Output:
    "HAG EXPRESS Bamboo Cutting Board - cutting board set - large cutting board -
    juice groove - bamboo board - reversible board - kitchen board - non slip"
    """
    import re

    # Base title: Brand + Product Line
    title = f"{prod.brand} {prod.product_line}".strip()

    # Word frequency tracker (only meaningful words >3 chars)
    def count_words(text: str) -> Dict[str, int]:
        words = re.findall(r"[a-z0-9]+", text.lower())
        counts: Dict[str, int] = {}
        for w in words:
            if len(w) > 3:  # Only meaningful words
                counts[w] = counts.get(w, 0) + 1
        return counts

    word_freq = count_words(title)

    # Get top keywords by RJ
    top_keywords = kws.by_rj()[:20]  # Try top 20 to fit 7-9

    # Greedy packing: add phrases until 197 chars (3 char safety margin)
    added_count = 0
    for kw_row in top_keywords:
        phrase = kw_row.keyword.strip()

        # Skip if empty or already in title
        if not phrase or phrase.lower() in title.lower():
            continue

        # Check if phrase would violate repetition (≤3× per word, relaxed from ≤2×)
        phrase_words = count_words(phrase)
        would_violate = any(
            word_freq.get(w, 0) + c > 3  # RELAXED to 3× (was 2×)
            for w, c in phrase_words.items()
        )

        if would_violate:
            continue

        # Try adding with dash separator
        candidate = f"{title} - {phrase}"

        if len(candidate) <= 197:  # Safe margin (3 chars for normalization)
            title = candidate
            added_count += 1
            # Update word freq
            for w, c in phrase_words.items():
                word_freq[w] = word_freq.get(w, 0) + c

            # Stop after 9 phrases to avoid over-stuffing
            if added_count >= 9:
                break

    # Final cleanup: remove double spaces
    title = re.sub(r'\s+', ' ', title).strip()

    # Hard limit to 200 chars
    if len(title) > 200:
        title = title[:200].rstrip()

    return title


def generate_bullets(prod: ProductInfo, kws: KeywordDb, max_chars: int = 500) -> List[str]:
    """Generate exactly 5 bullets, each clipped to `max_chars` characters.

    Bullet roles, in order: Problem+Solution, Features, Differentiators,
    Use Cases, Brand/Guarantee.

    Keywords come from the top 40 rows of `kws.by_rj()`; only phrases of at
    most 3 tokens (per `_tokenize`) are woven in: the first three into the
    Features bullet and the next four into the Use Cases bullet.

    Args:
        prod: Product information (brand, product_line, attributes, variant).
        kws: Keyword database.
        max_chars: Per-bullet character cap (default 500).

    Returns:
        A list of 5 bullet strings.
    """
    phrases = [kr.keyword for kr in kws.by_rj()[:40]]
    short_phrases = [p for p in phrases if len(_tokenize(p)) <= 3]

    def clip(s: str) -> str:
        return s[:max_chars]

    attrs = prod.attributes
    using_clause = f" using {', '.join(attrs[:2])}" if attrs else ""
    features = ", ".join(attrs[:4]) if attrs else "Essential design"
    # Without any short phrase the sentence used to render as "Optimized for ."
    # — leave it out entirely in that case.
    optimized_clause = f" Optimized for {', '.join(short_phrases[:3])}." if short_phrases else ""
    variant_clause = f" — {prod.flavor_or_variant}" if prod.flavor_or_variant else ""
    use_for = ", ".join(short_phrases[3:7]) or "daily kitchen tasks"

    # Exactly five entries are built here, so no padding or trimming is needed.
    return [
        clip(f"Solve Common Prep Frustrations: {prod.product_line} helps with clean, efficient prep{using_clause}."),
        clip(f"Key Features: {features}.{optimized_clause}"),
        clip(f"Why It’s Better: Built for reliability and ease of use{variant_clause}."),
        clip(f"Use It For: {use_for}. Suitable for home, gifts, and everyday prep."),
        clip(f"Brand Promise: {prod.brand} quality. Support included."),
    ]


def generate_backend_terms(title: str, bullets: List[str], kws: KeywordDb, byte_limit: int = 250, brand_blocklist: Optional[List[str]] = None, aggressive: bool = False) -> str:
    """Select keywords not yet covered by title/bullets and pack them under a byte limit.

    Normal mode:
    - Walk `kws.by_rj()` in order; lowercase and strip each keyword.
    - Skip a term when every one of its tokens already occurs in the title or
      bullets, or when it contains a blocklisted brand string.
    - Join accepted terms with commas; stop at the first term that would push
      the UTF-8 size past `byte_limit`.

    Aggressive mode delegates to `_generate_backend_max_packed` with a limit of
    `byte_limit - 1` (space-separated output).

    Args:
        title: Title text whose tokens count as already used.
        bullets: Bullet points whose tokens count as already used.
        kws: Keyword database.
        byte_limit: Maximum UTF-8 bytes of the result (default 250).
        brand_blocklist: Extra brand names to exclude (case-insensitive
            substring match). Empty/blank entries are ignored.
        aggressive: Use the space-separated greedy packer instead.

    Returns:
        Backend keywords string (comma-separated, or space-separated in
        aggressive mode).
    """
    if aggressive:
        return _generate_backend_max_packed(title, bullets, kws, byte_limit - 1, brand_blocklist)

    used = set(_tokenize(title))
    for bullet in bullets:
        used.update(_tokenize(bullet))

    # Built once (it does not change per keyword). Blank entries must be
    # dropped: "" is a substring of every term and would reject everything.
    raw_badlist = ["starbucks", "keurig", prod_brand_blocklist_placeholder()]
    if brand_blocklist:
        raw_badlist.extend(brand_blocklist)
    badlist = [b.strip().lower() for b in raw_badlist if b and b.strip()]

    out_terms: List[str] = []
    used_bytes = 0
    for kr in kws.by_rj():
        term = kr.keyword.strip().lower()
        if not term:
            continue
        toks = _tokenize(term)
        # Nothing new to index if the term has no tokens or all are already used.
        if not toks or all(w in used for w in toks):
            continue
        if any(bad in term for bad in badlist):
            continue
        # Size of ",".join(out_terms + [term]) without rebuilding the string.
        tentative_bytes = used_bytes + len(term.encode("utf-8")) + (1 if out_terms else 0)
        if tentative_bytes > byte_limit:
            break
        out_terms.append(term)
        used_bytes = tentative_bytes
    return ",".join(out_terms)


def _generate_backend_max_packed(title: str, bullets: List[str], kws: KeywordDb, limit_bytes: int = 249, brand_blocklist: Optional[List[str]] = None) -> str:
    """GREEDY PACKING: fill the backend field with unused phrases up to `limit_bytes`.

    Strategy:
    1. Drop keywords that already appear verbatim (case-insensitive substring)
       in the title or in the joined bullets.
    2. Drop keywords containing a blocklisted brand string.
    3. Sort the remaining phrases shortest-first so more of them fit.
    4. Add phrases one by one, separated by a single space, skipping (not
       stopping at) any phrase that would exceed `limit_bytes` UTF-8 bytes.

    Whole phrases are kept intact; individual words are not de-duplicated
    across phrases.

    Args:
        title: Title text to check phrases against.
        bullets: Bullet points to check phrases against.
        kws: Keyword database.
        limit_bytes: Maximum UTF-8 bytes of the result (default 249).
        brand_blocklist: Extra brand names to exclude. Empty/blank entries are
            ignored.

    Returns:
        Space-separated backend keyword string of at most `limit_bytes` bytes.
    """
    title_lower = title.lower()
    bullets_text = " ".join(bullets).lower()

    # Blank entries must be dropped: "" is a substring of every keyword and
    # would otherwise filter out the whole list.
    raw_badlist = ["starbucks", "keurig", prod_brand_blocklist_placeholder()]
    if brand_blocklist:
        raw_badlist.extend(brand_blocklist)
    badlist = [b.strip().lower() for b in raw_badlist if b and b.strip()]

    unused: List[str] = []
    seen = set()
    for kr in kws.by_rj():
        kw_lower = kr.keyword.strip().lower()
        if not kw_lower or kw_lower in seen:
            # Blank, or a repeat that would only waste bytes.
            continue
        if kw_lower in title_lower or kw_lower in bullets_text:
            continue
        if any(bad in kw_lower for bad in badlist):
            continue
        seen.add(kw_lower)
        unused.append(kw_lower)

    # Stable sort: ties keep their by_rj() order.
    unused.sort(key=len)

    backend_parts: List[str] = []
    current_bytes = 0
    for kw in unused:
        test_bytes = current_bytes + len(kw.encode("utf-8"))
        if backend_parts:
            test_bytes += 1  # single-space separator
        if test_bytes <= limit_bytes:
            backend_parts.append(kw)
            current_bytes = test_bytes
        # No break: equal-length phrases can differ in byte size, so a later
        # one may still fit.

    backend = " ".join(backend_parts)

    # Defensive trim; the accounting above should already guarantee the limit.
    while backend_parts and len(backend.encode("utf-8")) > limit_bytes:
        backend_parts.pop()
        backend = " ".join(backend_parts)

    return backend


def prod_brand_blocklist_placeholder() -> str:
    """Return the built-in placeholder entry for the backend brand blocklist.

    Kept as a function so the value can be changed in one place without
    introducing external configuration.
    """
    placeholder_brand = "yourbrand"
    return placeholder_brand


def generate_description(prod: ProductInfo, kws: KeywordDb, title: str, bullets: List[str]) -> str:
    """Generate a simple HTML description from keywords not yet in title/bullets.

    Structure:
    - heading paragraph with product line and brand
    - "Features include:" followed by a `<ul>` of up to 20 unused keywords
      (each passed through `str.capitalize`)
    - optional "Perfect for ..." paragraph listing up to 20 further keywords

    Only the top 100 rows of `kws.by_rj()` are considered; a keyword counts as
    used when it is a case-insensitive substring of the title + bullets text.

    The result never exceeds 2000 characters. List items and the trailing
    paragraph are added only while they fit, so the size cap does not cut the
    markup in the middle of a tag or keyword.

    Args:
        prod: Product information (`product_line` and `brand` are read).
        kws: Keyword database.
        title: Listing title.
        bullets: Listing bullet points.

    Returns:
        HTML description string of at most 2000 characters.
    """
    max_chars = 2000
    list_close = "</ul>"

    used_text = f"{title} {' '.join(bullets)}".lower()
    unused_kws: List[str] = []
    for kw_row in kws.by_rj()[:100]:
        kw = kw_row.keyword.strip()
        # Note: a blank keyword is a substring of anything, so it is skipped.
        if kw.lower() not in used_text:
            unused_kws.append(kw)

    desc_parts = [
        f"<p><strong>Premium {prod.product_line}</strong> from {prod.brand}</p>",
        "<p>Features include:</p>",
        "<ul>",
    ]
    # Running length of "\n".join(desc_parts); each later part costs 1 + len.
    length = len("\n".join(desc_parts))

    # Reserve room for the closing tag so the list can always be closed.
    item_budget = max_chars - (1 + len(list_close))
    for kw in unused_kws[:20]:
        item = f"<li>{kw.capitalize()}</li>"
        if length + 1 + len(item) > item_budget:
            break
        desc_parts.append(item)
        length += 1 + len(item)

    desc_parts.append(list_close)
    length += 1 + len(list_close)

    # Extra keywords in paragraph form; drop trailing ones until it fits.
    if len(unused_kws) > 20:
        extras = unused_kws[20:40]
        while extras:
            paragraph = f"<p>Perfect for {', '.join(extras)}.</p>"
            if length + 1 + len(paragraph) <= max_chars:
                desc_parts.append(paragraph)
                break
            extras.pop()

    description = "\n".join(desc_parts)

    # Last-resort cap (only reachable with an extremely long brand/product line).
    return description[:max_chars]


def boost_coverage_with_description(
    kwdb: KeywordDb,
    title: str,
    bullets: List[str],
    backend: str,
    description: str,
    target_coverage: float = 90.0
) -> Tuple[str, str, float]:
    """Boost coverage by adding missing keywords to description and backend.

    Strategy:
    1. Compute current coverage via `calculate_coverage`; return the inputs
       unchanged if it already meets `target_coverage`.
    2. Among the top 200 rows of `kwdb.by_rj()`, collect phrases that have at
       least one word absent from the whole listing text.
    3. Append up to 15 of them to the description as an "Also great for:"
       list, as long as the description stays within 2000 characters and the
       markup stays closed (only attempted when the description is shorter
       than 1800 characters).
    4. Append short phrases (≤20 chars, first 40) to the backend,
       space-separated, while it stays within 249 UTF-8 bytes.
    5. Recompute coverage.

    Args:
        kwdb: Keyword database
        title: Title text
        bullets: Bullet points
        backend: Current backend keywords
        description: Current description
        target_coverage: Target coverage % (default 90)

    Returns:
        Tuple of (new_backend, new_description, new_coverage)
    """
    max_description_chars = 2000
    max_backend_bytes = 249

    coverage = calculate_coverage(kwdb.keywords, title, bullets, backend, description)
    if coverage >= target_coverage:
        return backend, description, coverage

    listing_text = f"{title} {' '.join(bullets)} {backend} {description}".lower()
    listing_words = set(re.findall(r"[a-z0-9]+", listing_text))

    # A phrase is "missing" when ANY of its words is absent from the listing.
    # (Requiring zero overlap would miss almost every phrase, since most share
    # at least one common root word with the title.) Phrases with no
    # alphanumeric words carry nothing to index and are skipped.
    missing_phrases: List[str] = []
    for kw_row in kwdb.by_rj()[:200]:
        kw_text = kw_row.keyword.strip()
        kw_words = set(re.findall(r"[a-z0-9]+", kw_text.lower()))
        if kw_words and not kw_words <= listing_words:
            missing_phrases.append(kw_text)

    if not missing_phrases:
        return backend, description, coverage

    # --- Description: append a closed <ul> of missing phrases ---------------
    new_description = description
    if len(description) < 1800:
        opening = "\n\n<p>Also great for:</p>\n<ul>\n"
        closing = "\n</ul>"
        room = max_description_chars - len(description) - len(opening) - len(closing)
        items: List[str] = []
        for phrase in missing_phrases[:15]:
            item = f"<li>{phrase}</li>"
            cost = len(item) + (1 if items else 0)  # newline between items
            if cost > room:
                break
            items.append(item)
            room -= cost
        if items:
            new_description = description + opening + "\n".join(items) + closing

    # --- Backend: pack short phrases, space-separated ------------------------
    backend_parts = backend.split() if backend else []
    current_bytes = len(" ".join(backend_parts).encode("utf-8"))
    # Running text used for the "already present" check so that phrases added
    # in this loop are not added a second time.
    present_text = backend.lower()

    short_missing = [p for p in missing_phrases if len(p) <= 20][:40]
    for phrase in short_missing:
        phrase_lower = phrase.lower()
        if phrase_lower in present_text:
            continue
        # A separator byte is only needed when something precedes the phrase.
        separator = 1 if backend_parts else 0
        test_bytes = current_bytes + separator + len(phrase_lower.encode("utf-8"))
        if test_bytes <= max_backend_bytes:
            backend_parts.append(phrase_lower)
            current_bytes = test_bytes
            present_text = f"{present_text} {phrase_lower}"

    new_backend = " ".join(backend_parts)

    new_coverage = calculate_coverage(kwdb.keywords, title, bullets, new_backend, new_description)

    return new_backend, new_description, new_coverage


def build_listing(prod: ProductInfo, kwdb: KeywordDb, aggressive: bool = False, boost_coverage: bool = False, target_coverage: float = 90.0) -> ListingDraft:
    """Assemble a complete listing draft (title, bullets, backend, description).

    Args:
        prod: Product information.
        kwdb: Keyword database.
        aggressive: Forwarded to title and backend generation; also raises the
            listing-wide repetition cap from 3 to 5 per word.
        boost_coverage: When True and coverage is below `target_coverage`, run
            `boost_coverage_with_description` on backend and description.
        target_coverage: Coverage % to aim for when boosting (default 90).

    Returns:
        A `ListingDraft` carrying title, bullets, backend keywords,
        description and the computed coverage.
    """
    title = generate_title(prod, kwdb, aggressive=aggressive)
    bullets = generate_bullets(prod, kwdb)

    # Listing-wide anti-stuffing check over title + bullets.
    if aggressive:
        max_repetition = 5
    else:
        max_repetition = 3
    combined_text = title + " " + " ".join(bullets)
    limited_text = _limit_repetitions(combined_text, max_per_word=max_repetition)
    if limited_text != combined_text:
        # NOTE(review): this regenerates the title with the same arguments and
        # the limited text itself is not used, so the title/bullets appear to
        # come out unchanged — confirm whether the cleaned text was meant to be
        # applied here.
        title = generate_title(prod, kwdb, aggressive=aggressive)

    backend = generate_backend_terms(
        title,
        bullets,
        kwdb,
        brand_blocklist=[prod.brand],
        aggressive=aggressive,
    )
    description = generate_description(prod, kwdb, title, bullets)
    coverage = calculate_coverage(kwdb.keywords, title, bullets, backend, description)

    needs_boost = boost_coverage and coverage < target_coverage
    if needs_boost:
        backend, description, coverage = boost_coverage_with_description(
            kwdb, title, bullets, backend, description, target_coverage
        )

    draft = ListingDraft(
        title=title,
        bullets=bullets,
        backend_keywords=backend,
        description=description,
        coverage=coverage,
    )
    return draft


# -------------------------------
# Feed export (CSV scaffold)
# -------------------------------

# Column order of the scaffold feed CSV: identity fields, description,
# the five bullet columns, then backend keywords.
FEED_COLUMNS = [
    "item_sku",
    "item_name",
    "brand_name",
    "product_description",
    *(f"bullet_point{n}" for n in range(1, 6)),
    "generic_keywords",
]


def export_feed_csv(path: Path, prod: ProductInfo, listing: ListingDraft) -> None:
    """Write a one-row, flat-file-like CSV for manual upload tweaking.

    The file has the `FEED_COLUMNS` header and a single data row. The
    description column falls back to the space-joined bullets when the listing
    has no description, and is capped at 2000 characters. Missing bullets are
    written as empty strings.

    Note: Real Amazon category flat files vary by product type; this is a neutral scaffold.
    """
    bullets = listing.bullets
    description_text = listing.description or " ".join(bullets)

    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=FEED_COLUMNS)
        writer.writeheader()

        record = {
            # Synthetic SKU: truncated brand + product line, upper-cased.
            "item_sku": f"{prod.brand[:8]}-{prod.product_line[:12]}-SKU".upper(),
            "item_name": listing.title,
            "brand_name": prod.brand,
            "product_description": description_text[:2000],
        }
        for slot in range(5):
            record[f"bullet_point{slot + 1}"] = bullets[slot] if len(bullets) > slot else ""
        record["generic_keywords"] = listing.backend_keywords

        writer.writerow(record)


# -------------------------------
# CLI
# -------------------------------


def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    p = argparse.ArgumentParser(
        prog="amazon_unified_tool",
        description="Unified Amazon seller CLI: discover, dedupe, keywords, listings, feed",
    )

    sub = p.add_subparsers(dest="cmd", required=True)

    # discover
    d = sub.add_parser("discover", help="Find DataDive/Helium10/listing-related files")
    d.add_argument("--roots", nargs="*", default=["~/Downloads", "~/Documents", "~/Desktop"], help="Root dirs to scan")
    d.add_argument("--max-depth", type=int, default=6)
    d.add_argument("--json", action="store_true", help="Emit JSON output")

    # dedupe
    dd = sub.add_parser("dedupe", help="Group duplicates by hash and optionally move older copies")
    dd.add_argument("--roots", nargs="*", default=["~/Downloads", "~/Documents", "~/Desktop"], help="Root dirs to scan")
    dd.add_argument("--quarantine", type=str, default="~/Desktop/_dedup_quarantine", help="Where to move duplicates")
    dd.add_argument("--apply", action="store_true", help="Perform moves (otherwise dry-run)")

    # keywords
    k = sub.add_parser("keywords", help="Load keywords from CSV and print top entries")
    k.add_argument("csv_path", help="CSV export with keyword column")
    k.add_argument("--top", type=int, default=25)
    k.add_argument("--json", action="store_true")

    # keywords-merge (merge multiple CSVs into a master list with Ranking Juice)
    km = sub.add_parser("keywords-merge", help="Merge CSVs (DataDive/Helium10) and output Master Keyword List")
    km.add_argument("csv_paths", nargs="+", help="One or more CSV exports")
    km.add_argument("--out", default=None, help="Optional output CSV path; default prints JSON")
    km.add_argument("--top", type=int, default=500, help="Limit rows in output")
    km.add_argument("--asin", default=None, help="Optional ASIN filter; keeps rows referencing this ASIN when available")
    km.add_argument("--include", default=None, help="Regex: keep only keywords matching this pattern")
    km.add_argument("--exclude", default=None, help="Regex: drop keywords matching this pattern")

    # listing
    l = sub.add_parser("listing", help="Build listing (title, bullets, backend terms)")
    l.add_argument("csv_paths", nargs="+", help="One or more Keyword CSVs (DataDive/Helium10)")
    l.add_argument("--brand", required=True)
    l.add_argument("--product-line", required=True, help="Product name/line")
    l.add_argument("--category", required=True, help="High-level category (e.g., Coffee)")
    l.add_argument("--attr", nargs="*", default=[], help="Attributes like size/material (e.g., 12oz 'Whole Bean')")
    l.add_argument("--variant", default=None, help="Flavor or variant label")
    l.add_argument("--feed-out", default=None, help="Optional path to write a simple feed CSV")
    l.add_argument("--validate", action="store_true", help="Print validation metrics for the generated listing")
    l.add_argument("--aggressive", action="store_true", help="🏆 AGGRESSIVE MODE: Max optimization (7-9 phrases in title, ≤3× repetition, ≤5× listing)")
    l.add_argument("--boost-coverage", action="store_true", help="📊 COVERAGE BOOSTER: Add description + pack missing keywords (target ≥90%)")
    l.add_argument("--coverage-target", type=float, default=90.0, help="Target coverage %% (default: 90, use with --boost-coverage)")
    l.add_argument("--groq", action="store_true", help="Use Groq AI to enhance the generated listing")
    l.add_argument("--groq-key", default=None, help="GROQ_API_KEY (falls back to env)")
    l.add_argument("--groq-model", default="llama-3.1-70b-versatile", help="Groq model id")
    l.add_argument("--json", action="store_true")
    l.add_argument("--include", default=None, help="Regex: keep only keywords matching this pattern")
    l.add_argument("--exclude", default=None, help="Regex: drop keywords matching this pattern")

    # proposal (email content for Upwork outreach)
    pr = sub.add_parser("proposal", help="Generate Upwork proposal email (subject + body)")
    pr.add_argument("--client-name", required=True)
    pr.add_argument("--budget", default="")
    pr.add_argument("--project-notes", default="", help="Short notes about the client/project")
    pr.add_argument("--video-url", default="", help="Optional AI video URL to include")
    pr.add_argument("--json", action="store_true")

    # email-bulk-preview (CSV → subjects/bodies)
    eb = sub.add_parser("email-bulk-preview", help="CSV to preview outreach emails (no sending)")
    eb.add_argument("csv_path", help="CSV with name,email,budget,notes (flexible headers)")
    eb.add_argument("--limit", type=int, default=20)
    eb.add_argument("--json", action="store_true")

    return p.parse_args(argv)


def _cmd_discover(args: argparse.Namespace) -> int:
    """`discover`: run `discover` over the roots, hash the results, print them."""
    roots = [Path(os.path.expanduser(p)) for p in args.roots]
    items = discover(roots, max_depth=args.max_depth)
    compute_hashes(items)
    out = [
        {
            "path": str(m.path),
            "size_bytes": m.size,
            "mtime_iso": m.mtime_iso,
            "md5": m.md5,
        }
        for m in items
    ]
    if args.json:
        print(json.dumps(out, ensure_ascii=False, indent=2))
    else:
        print(f"{len(out)} matches")
        # Human-readable mode lists only the first 50 entries.
        for m in out[:50]:
            print(f"- {m['path']} ({_human_bytes(m['size_bytes'])})")
        if len(out) > 50:
            print(f"… and {len(out) - 50} more")
    return 0


def _cmd_dedupe(args: argparse.Namespace) -> int:
    """`dedupe`: group duplicates and plan (or, with --apply, perform) quarantine moves."""
    roots = [Path(os.path.expanduser(p)) for p in args.roots]
    items = discover(roots)
    compute_hashes(items)
    groups = group_duplicates(items)
    print(f"{len(groups)} duplicate groups found")
    if not groups:
        return 0
    quarantine = Path(os.path.expanduser(args.quarantine))
    # Dry-run unless --apply was given.
    plan = quarantine_duplicates(groups, quarantine, dry_run=not args.apply)
    for src, dst in plan[:200]:
        print(("MOVE" if args.apply else "PLAN"), str(src), "→", str(dst))
    if len(plan) > 200:
        print(f"… and {len(plan) - 200} more")
    if not args.apply:
        print("(dry-run) Use --apply to perform moves")
    else:
        print(f"Moved {len(plan)} files into {quarantine}")
    return 0


def _cmd_keywords(args: argparse.Namespace) -> int:
    """`keywords`: load one CSV and print its top rows from `by_rj()`."""
    path = Path(args.csv_path)
    if not path.exists():
        print(f"ERROR: CSV not found: {path}", file=sys.stderr)
        return 2
    db = load_keywords_from_csv(path)
    rows = db.by_rj()[: args.top]
    out = [{
        "keyword": r.keyword,
        "search_volume": r.search_volume,
        "relevancy": r.relevancy,
        "competition": r.competition,
        "ranking_juice": round(r.ranking_juice, 2),
    } for r in rows]
    if args.json:
        print(json.dumps(out, ensure_ascii=False, indent=2))
    else:
        for r in out:
            print(f"- {r['keyword']} (RJ: {r['ranking_juice']}, vol: {r['search_volume']}, rel: {r['relevancy']})")
    return 0


def _cmd_keywords_merge(args: argparse.Namespace) -> int:
    """`keywords-merge`: merge several CSVs into one list; write CSV or print JSON."""
    paths = [Path(p) for p in args.csv_paths]
    all_rows: List[KeywordRow] = []
    for pth in paths:
        if not pth.exists():
            print(f"WARN: Skipping missing CSV: {pth}", file=sys.stderr)
            continue
        if args.asin:
            db = load_keywords_from_csv_with_asin(pth, args.asin)
        else:
            db = load_keywords_from_csv(pth)
        all_rows.extend(db.keywords)
    merged = KeywordDb(all_rows).unique()
    merged.compute_ranking_juice()
    merged = _filter_keyword_db(merged, args.include, args.exclude)
    top_rows = merged.by_rj()[: args.top]
    if args.out:
        outp = Path(args.out)
        with outp.open("w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(["keyword", "search_volume", "relevancy", "competition", "ranking_juice", "source"])
            for r in top_rows:
                w.writerow([r.keyword, r.search_volume, r.relevancy, r.competition, f"{r.ranking_juice:.2f}", r.source])
        print(f"Master Keyword List saved: {outp}")
    else:
        out = [{
            "keyword": r.keyword,
            "search_volume": r.search_volume,
            "relevancy": r.relevancy,
            "competition": r.competition,
            "ranking_juice": round(r.ranking_juice, 2),
            "source": r.source,
        } for r in top_rows]
        print(json.dumps(out, ensure_ascii=False, indent=2))
    return 0


def _cmd_listing(args: argparse.Namespace) -> int:
    """`listing`: merge keyword CSVs, build a listing draft and print/export it."""
    # In --json mode stdout must hold nothing but the JSON document, so the
    # banners and the coverage report are routed to stderr there. In the
    # human-readable mode they stay on stdout as before.
    info = sys.stderr if args.json else sys.stdout

    # Merge all provided CSVs
    all_rows: List[KeywordRow] = []
    for pth in args.csv_paths:
        p = Path(pth)
        if not p.exists():
            print(f"WARN: Missing CSV skipped: {p}", file=sys.stderr)
            continue
        db = load_keywords_from_csv(p)
        all_rows.extend(db.keywords)
    if not all_rows:
        print("ERROR: No valid CSV inputs", file=sys.stderr)
        return 2
    kwdb = KeywordDb(all_rows).unique()
    kwdb.compute_ranking_juice()
    kwdb = _filter_keyword_db(kwdb, args.include, args.exclude)
    prod = ProductInfo(
        brand=args.brand,
        product_line=args.product_line,
        category=args.category,
        attributes=args.attr,
        flavor_or_variant=args.variant,
    )

    if args.aggressive:
        print("🏆 AGGRESSIVE MODE: Maximum optimization enabled", file=info)
        print("   - Title: 7-9 phrases (vs 3-4)", file=info)
        print("   - Repetition: ≤3× per word in title, ≤5× in listing", file=info)
        print("   - Strategy: Dash-separated EXACT phrases (no 'with'/'for' connectors)", file=info)

    if args.boost_coverage:
        print(f"📊 COVERAGE BOOSTER: Target {args.coverage_target}% coverage", file=info)
        print("   - Generating description with missing keywords", file=info)
        print("   - Packing remaining keywords into backend (≤249 bytes)", file=info)

    draft = build_listing(
        prod, kwdb,
        aggressive=args.aggressive,
        boost_coverage=args.boost_coverage,
        target_coverage=args.coverage_target
    )

    # Coverage report (reflects the draft before any Groq enhancement below).
    print("\n📊 COVERAGE REPORT:", file=info)
    print(f"   Total Keywords: {len(kwdb.keywords)}", file=info)
    print(f"   Coverage (top 200): {draft.coverage:.1f}%", file=info)
    if draft.coverage >= args.coverage_target:
        print(f"   ✅ Target achieved: {draft.coverage:.1f}% ≥ {args.coverage_target}%", file=info)
    else:
        print(f"   ⚠️  Below target: {draft.coverage:.1f}% < {args.coverage_target}%", file=info)

    if args.groq:
        api_key = args.groq_key or os.getenv("GROQ_API_KEY", "")
        if not api_key:
            print("WARN: --groq requested but GROQ_API_KEY missing; skipping enhancement", file=sys.stderr)
        else:
            draft = enhance_listing_with_groq(prod, kwdb, draft, api_key=api_key, model=args.groq_model)
    if args.feed_out:
        export_feed_csv(Path(args.feed_out), prod, draft)
    metrics = validate_listing(draft.title, draft.bullets, draft.backend_keywords, aggressive=args.aggressive) if args.validate else None

    if args.json:
        print(json.dumps({
            "title": draft.title,
            "bullets": draft.bullets,
            "backend_keywords": draft.backend_keywords,
            "description": draft.description,
            "coverage": round(draft.coverage, 1),
            **({"validation": metrics} if metrics else {}),
        }, ensure_ascii=False, indent=2))
        return 0

    print("\n" + "="*80)
    print("TITLE:\n" + draft.title)
    print(f"Length: {len(draft.title)} chars")
    print("\nBULLETS:")
    for i, b in enumerate(draft.bullets, 1):
        print(f" {i}. {b}")
    print("\nBACKEND KEYWORDS (≤250 bytes):")
    print(draft.backend_keywords)
    print(f"Bytes: {len(draft.backend_keywords.encode('utf-8'))}/250")
    if draft.description:
        print("\nDESCRIPTION:")
        # Only a 500-character preview is shown on the console.
        print(draft.description[:500] + "..." if len(draft.description) > 500 else draft.description)
        print(f"Length: {len(draft.description)} chars")
    print("="*80)
    if args.feed_out:
        print(f"\n✅ Feed CSV written: {args.feed_out}")
    if metrics:
        print("\n📋 VALIDATION:")
        print(json.dumps(metrics, ensure_ascii=False, indent=2))
    return 0


def _cmd_proposal(args: argparse.Namespace) -> int:
    """`proposal`: print a templated outreach email (subject + body)."""
    subject = f"Amazon Feed Migration + Listing Optimization — Quick Win Plan for {args.client_name}"
    body_lines = [
        f"Hi {args.client_name},",
        "",
        "I reviewed your Amazon setup and prepared a quick, low-risk plan to raise rankings and conversions:",
        "- Fix backend taxonomy (GL → ITK → Product Type) for proper indexing",
        "- Build SEO title + 5 bullets (DataDive roots, anti-stuffing)",
        "- Consolidate variations (parent/child) and upload via flat file",
        "- Optional: A+ modules (brand story, comparison chart)",
    ]
    if args.budget:
        body_lines.append(f"- Budget alignment: {args.budget}")
    if args.project_notes:
        body_lines.append(f"- Notes: {args.project_notes}")
    if args.video_url:
        body_lines.append("")
        body_lines.append(f"Intro video: {args.video_url}")
    body_lines.extend([
        "",
        "If you’d like, I can share a draft listing and a flat-file scaffold in 24h.",
        "",
        "Best,",
        "Shawn",
    ])

    body = "\n".join(body_lines)
    if args.json:
        print(json.dumps({"subject": subject, "body": body}, ensure_ascii=False, indent=2))
    else:
        print("SUBJECT:\n" + subject)
        print("\nBODY:\n" + body)
    return 0


def _cmd_email_bulk_preview(args: argparse.Namespace) -> int:
    """`email-bulk-preview`: render outreach emails from a CSV (nothing is sent)."""
    path = Path(args.csv_path)
    if not path.exists():
        print(f"ERROR: CSV not found: {path}", file=sys.stderr)
        return 2
    with path.open("r", encoding="utf-8", errors="ignore") as f:
        dialect = _sniff_csv(f)
        reader = csv.DictReader(f, dialect=dialect)
        out = []
        for i, row in enumerate(reader):
            if i >= args.limit:
                break
            # Flexible header mapping: first non-empty alias wins.
            name = row.get("name") or row.get("client_name") or row.get("full_name") or row.get("first_name") or "there"
            budget = row.get("budget") or row.get("project_budget") or row.get("price") or ""
            notes = row.get("notes") or row.get("description") or row.get("project_description") or row.get("details") or ""
            subject = f"Amazon Feed + Listing Optimization Plan for {name}"
            body = (
                f"Hi {name},\n\n"
                "Quick plan to lift your Amazon results:\n"
                "- Backend taxonomy fix (GL→ITK→Product Type)\n"
                "- SEO title + 5 bullets (DataDive)\n"
                "- Parent/child variations via flat file\n"
                "- Optional A+ content (brand story, comparison chart)\n\n"
                + (f"Budget context: {budget}\n" if budget else "")
                + (f"Notes: {notes}\n\n" if notes else "\n")
                + "Happy to share a draft listing and feed scaffold in 24h.\n\nBest,\nShawn"
            )
            out.append({"name": name, "subject": subject, "body": body})
    if args.json:
        print(json.dumps(out, ensure_ascii=False, indent=2))
    else:
        for i, row in enumerate(out, 1):
            print(f"[{i}] {row['name']}")
            print("SUBJECT:", row["subject"])
            print("BODY:\n" + row["body"])
            print("-" * 60)
    return 0


def main(argv: Optional[Sequence[str]] = None) -> int:
    """CLI entry point: parse arguments and dispatch to the subcommand handler.

    Args:
        argv: Argument list; None lets argparse read `sys.argv[1:]`.

    Returns:
        Process exit code: 0 on success, 2 on a missing/invalid input or an
        unknown command.
    """
    args = _parse_args(argv)

    handlers = {
        "discover": _cmd_discover,
        "dedupe": _cmd_dedupe,
        "keywords": _cmd_keywords,
        "keywords-merge": _cmd_keywords_merge,
        "listing": _cmd_listing,
        "proposal": _cmd_proposal,
        "email-bulk-preview": _cmd_email_bulk_preview,
    }
    handler = handlers.get(args.cmd)
    if handler is None:
        print("Unknown command", file=sys.stderr)
        return 2
    return handler(args)


# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
