# molzi3d-next/scripts/fix-tables-final.py

"""
Finales Reparatur-Skript:
1. Header-Junk (WordPress Hero-Bloecke) entfernen
2. Einzeilige Pipe-Tabellen in echte mehrzeilige Markdown-Tabellen umwandeln
3. Backslash-Artefakte bereinigen
"""
import re, os, glob

DIRS = ["app/src/content/guides", "app/src/content/pages"]


def fix_inline_tables(text):
    """Expand every single-line pipe table in *text* into a multi-line table.

    A line is treated as a collapsed table when it contains at least eight
    pipe characters and the substring "| ---", i.e. the shape
    ``| Header1 | Header2 | ... | ------- | ... | Cell1 | Cell2 | ...``.
    Such a line is handed to ``parse_inline_table``; when that returns a
    falsy value the line is kept unchanged. All other lines pass through
    untouched.

    Returns the text with the converted lines, joined with "\\n" again.
    """

    def convert(row):
        # Cheap pre-filter first, so the parser only sees plausible candidates.
        looks_like_table = row.count("|") >= 8 and "| ---" in row
        if not looks_like_table:
            return row
        expanded = parse_inline_table(row)
        return expanded if expanded else row

    return "\n".join(convert(row) for row in text.split("\n"))


# One cell of a header delimiter row: at least three dashes, optionally
# flanked by the colons Markdown uses to mark column alignment.
_SEPARATOR_CELL_RE = re.compile(r'^:?-{3,}:?$')


def _separator_cell(spec, width):
    """Return a delimiter cell *width* wide that keeps the alignment colons of *spec*."""
    left = ":" if spec.startswith(":") else ""
    right = ":" if spec.endswith(":") else ""
    return left + "-" * (width - len(left) - len(right)) + right


def parse_inline_table(line):
    """Turn a pipe table collapsed onto one line into a multi-line Markdown table.

    The line is split on "|"; every fragment is stripped and empty fragments
    are discarded. The number of cells in front of the first delimiter cell
    (``---``, ``:---``, ``---:`` or ``:---:``) is taken as the column count,
    and the remaining cells are cut into rows of that size.

    Because empty fragments are discarded, an intentionally empty cell in the
    middle of the input cannot be told apart from the gap between two joined
    rows; such a cell shifts the following cells to the left.

    Args:
        line: The single line holding the collapsed table.

    Returns:
        The table as a string with one padded row per line, or ``None`` when
        the line has no delimiter cell, fewer than two columns, or fewer than
        two rows.
    """
    parts = [cell for cell in (piece.strip() for piece in line.split("|")) if cell]

    # The header occupies every cell in front of the first delimiter cell.
    first_sep = next(
        (idx for idx, cell in enumerate(parts) if _SEPARATOR_CELL_RE.match(cell)),
        None,
    )
    if first_sep is None:
        return None

    num_cols = first_sep
    if num_cols < 2:
        return None

    # Cut the flat cell list into rows of num_cols cells each.
    rows = []  # (is_separator, cells) pairs in input order
    for start in range(0, len(parts), num_cols):
        chunk = parts[start:start + num_cols]
        is_separator = all(_SEPARATOR_CELL_RE.match(cell) for cell in chunk)
        if len(chunk) < num_cols:
            if is_separator:
                # A trailing run of dashes carries no content; drop it.
                continue
            # Pad an incomplete last row with empty cells.
            chunk = chunk + [""] * (num_cols - len(chunk))
        rows.append((is_separator, chunk))

    if len(rows) < 2:
        return None

    # Column widths come from the content rows, with a minimum of three so
    # the delimiter row always has room for its dashes.
    widths = [3] * num_cols
    for is_separator, cells in rows:
        if not is_separator:
            widths = [max(width, len(cell)) for width, cell in zip(widths, cells)]

    def render(cells):
        return "| " + " | ".join(cells) + " |"

    md_lines = []
    header_done = False
    for is_separator, cells in rows:
        if is_separator:
            # Only the first delimiter row is emitted; later ones are dropped.
            if not header_done:
                md_lines.append(
                    render(_separator_cell(spec, width) for spec, width in zip(cells, widths))
                )
                header_done = True
            continue
        md_lines.append(render(cell.ljust(width) for cell, width in zip(cells, widths)))

    # No complete delimiter row was found: insert a plain one after the header.
    if not header_done and md_lines:
        md_lines.insert(1, render("-" * width for width in widths))

    return "\n".join(md_lines)


def clean_header_junk(body):
    """Drop leading hero-block junk lines from the start of *body*.

    Lines are scanned from the top. A line counts as junk when, after
    stripping, it starts with two or more backslashes or contains one of the
    hero keywords; everything up to and including the last junk line seen is
    cut off. The scan ends at the first non-junk line that starts with "#" or
    is longer than 20 characters without a leading backslash. Other lines
    (blank, short, or starting with a single backslash) are stepped over and
    only removed if a junk line follows them.

    Returns the remaining lines joined with "\\n".
    """
    hero_keywords = (
        "MATERIAL GUIDE", "SETUP GUIDE", "KALIBRIER",
        "TROUBLESHOOT", "SLICER GUIDE", "BEGINNER", "EXPERTEN",
        "MOHS HÄRTE", "FEHLERBILD", "DRUCKPARAMETER",
    )
    backslash_run = re.compile(r'^\\{2,}')

    all_lines = body.split("\n")
    first_kept = 0

    for index, raw in enumerate(all_lines):
        text = raw.strip()

        is_junk = bool(backslash_run.match(text)) or any(
            keyword in text for keyword in hero_keywords
        )
        if is_junk:
            # Everything up to and including this line gets discarded.
            first_kept = index + 1
        elif text.startswith("#") or (len(text) > 20 and not text.startswith("\\")):
            # A heading or a regular paragraph: real content starts here.
            break

    return "\n".join(all_lines[first_kept:])


def process_file(filepath):
    """Clean one Markdown file in place and report whether it was rewritten.

    The file must start with a ``---`` delimited frontmatter block, which is
    kept verbatim. The body after it goes through, in order: hero-junk
    removal, backslash cleanup, inline-table expansion and blank-line
    normalisation. The file is only written when the result differs from
    what was read.

    Args:
        filepath: Path of the Markdown file to process.

    Returns:
        True if the file was rewritten; False if it has no frontmatter or
        the cleanup changed nothing.
    """
    # Decode explicitly instead of relying on the platform's locale encoding:
    # the cleanup matches non-ASCII text, so a wrong default would either
    # fail to decode or write mojibake back.
    # NOTE(review): assumes the content files are UTF-8 — confirm.
    with open(filepath, encoding="utf-8") as f:
        content = f.read()

    # Protect the frontmatter: split it off and only touch the body.
    fm_match = re.match(r'^(---\n[\s\S]*?\n---)\n([\s\S]*)$', content)
    if not fm_match:
        return False

    fm = fm_match.group(1)
    body = fm_match.group(2)

    # 1. Remove header junk
    body = clean_header_junk(body)

    # 2. Backslash cleanup
    body = re.sub(r'^\s*\\+\s*$', '', body, flags=re.MULTILINE)  # backslash-only lines
    body = re.sub(r'^\\ ', '', body, flags=re.MULTILINE)  # "\ " at the start of a line
    body = re.sub(r'\s*\\$', '', body, flags=re.MULTILINE)  # "\" at the end of a line
    body = re.sub(r'\\~', '~', body)  # escaped tilde
    body = re.sub(r'\\(\d+)\.', r'\1.', body)  # escaped list numbers

    # 3. Expand single-line pipe tables
    body = fix_inline_tables(body)

    # 4. Collapse runs of blank lines and trim the body
    body = re.sub(r'\n{3,}', '\n\n', body)
    body = body.strip()

    result = f"{fm}\n\n{body}\n"

    if result == content:
        return False

    with open(filepath, 'w', encoding="utf-8") as f:
        f.write(result)
    return True


# Main: run the cleanup over every *.md file directly inside each directory
# listed in DIRS and report which files were rewritten.
print("=== Finales Tabellen- und Content-Cleanup ===\n")

total = 0
for d in DIRS:
    for fp in sorted(glob.glob(os.path.join(d, "*.md"))):
        if process_file(fp):
            print(f"  FIXED: {os.path.basename(fp)}")
            total += 1

print(f"\n{total} Dateien repariert.\n")

# Verification: re-read every file and count what the cleanup should have
# removed, so leftovers are visible in the output.
print("=== Verifikation ===")
issues = 0
for d in DIRS:
    for fp in sorted(glob.glob(os.path.join(d, "*.md"))):
        # Decode explicitly as UTF-8 rather than with the locale default, so
        # non-ASCII content does not break the check on other platforms.
        # NOTE(review): assumes the content files are UTF-8 — confirm.
        with open(fp, encoding="utf-8") as f:
            c = f.read()

        # Single-line tables (at least 8 pipes plus a separator on one line)
        inline_tables = 0
        for line in c.split("\n"):
            if line.count("|") >= 8 and "| ---" in line:
                inline_tables += 1

        # Lines that start with a run of two or more backslashes
        bs = len(re.findall(r'^\s*\\{2,}\s', c, re.MULTILINE))

        if inline_tables > 0 or bs > 0:
            print(f"  {os.path.basename(fp)}: {inline_tables} inline-tables, {bs} backslash-junk")
            issues += 1

print(f"\n{'Alle sauber!' if issues == 0 else f'{issues} Dateien mit Problemen'}")