fix: Komplette Neukonvertierung — Tabellen und Content endgueltig sauber
Grundproblem: WordPress speicherte HTML mit escaped \n (literal \\n statt Newlines) und inline style-Attributen in Tabellen. node-html-markdown konvertierte diese als Backslash-Artefakte und einzeilige Pipe-Strings. Loesung: Neues final-rebuild.mjs Skript: - \\n -> echte Newlines VOR der Konvertierung - style-Attribute komplett entfernt (verursachten Backslash-Tabellen) - Nav/Footer/SVG per Regex vor dem Parsing entfernt - Tabellen werden jetzt korrekt mehrzeilig mit Header/Separator/Rows gerendert - 44 Guides + 15 Pages verifiziert: 0 Probleme Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
226
scripts/fix-tables-final.py
Normal file
226
scripts/fix-tables-final.py
Normal file
@@ -0,0 +1,226 @@
|
||||
"""
|
||||
Finales Reparatur-Skript:
|
||||
1. Header-Junk (WordPress Hero-Bloecke) entfernen
|
||||
2. Einzeilige Pipe-Tabellen in echte mehrzeilige Markdown-Tabellen umwandeln
|
||||
3. Backslash-Artefakte bereinigen
|
||||
"""
|
||||
import re, os, glob
|
||||
|
||||
DIRS = ["app/src/content/guides", "app/src/content/pages"]
|
||||
|
||||
|
||||
def fix_inline_tables(text):
    """Rewrite single-line pipe tables as proper multi-line Markdown tables.

    A line is treated as a collapsed table when it carries at least eight
    pipe characters and a "| ---" separator fragment; such lines are fed
    through parse_inline_table and replaced by its multi-line output.
    Lines that do not match (or that parse_inline_table rejects) are kept
    unchanged.
    """
    result = []
    for raw in text.split("\n"):
        # Heuristic for a whole table squashed onto one line:
        # | H1 | H2 | ... | ------- | ... | Cell1 | Cell2 | ...
        looks_like_table = raw.count("|") >= 8 and "| ---" in raw
        if looks_like_table:
            rebuilt = parse_inline_table(raw)
            if rebuilt:
                result.append(rebuilt)
                continue
        result.append(raw)
    return "\n".join(result)
|
||||
|
||||
|
||||
def parse_inline_table(line):
    """Parse a single-line pipe table and return a multi-line Markdown table.

    Returns the rebuilt table as a string, or None when *line* does not
    look like a table (no cells, no all-dash separator run, or fewer than
    two columns before the first separator).
    """
    is_dashes = re.compile(r'^-{3,}$')

    # Flatten the line into non-empty, stripped cell values.
    cells = [c.strip() for c in line.split("|")]
    cells = [c for c in cells if c]
    if not cells:
        return None

    sep_positions = [i for i, c in enumerate(cells) if is_dashes.match(c)]
    if not sep_positions:
        return None

    # Column count = number of header cells before the first separator run.
    ncols = sep_positions[0]
    if ncols < 2:
        return None

    # Re-chunk the flat cell list into rows of ncols cells each:
    #   header = cells[0:ncols]
    #   separator = cells[ncols:ncols*2] (all dashes) -> "SEP" sentinel
    #   row 1 = cells[ncols*2:ncols*3], etc.
    table_rows = []
    for start in range(0, len(cells), ncols):
        group = cells[start:start + ncols]
        if len(group) == ncols:
            table_rows.append("SEP" if all(is_dashes.match(c) for c in group) else group)
        elif group:
            # Short trailing chunk: pad with empty cells; drop it if its
            # non-empty cells are all separator dashes.
            group = group + [""] * (ncols - len(group))
            if not all(is_dashes.match(c) for c in group if c):
                table_rows.append(group)
    if len(table_rows) < 2:
        return None

    # Column widths (minimum 3 so the dash separator stays valid Markdown).
    col_w = [0] * ncols
    for r in table_rows:
        if r != "SEP":
            for idx, cell in enumerate(r):
                if idx < ncols:
                    col_w[idx] = max(col_w[idx], len(cell))
    col_w = [w if w >= 3 else 3 for w in col_w]

    def fmt(values):
        # One rendered table line from an iterable of cell strings.
        return "| " + " | ".join(values) + " |"

    out = []
    have_header_sep = False
    for r in table_rows:
        if r == "SEP":
            # Emit the dash separator only once, after the header.
            if not have_header_sep:
                out.append(fmt("-" * w for w in col_w))
                have_header_sep = True
            continue
        out.append(fmt((r[idx] if idx < len(r) else "").ljust(col_w[idx])
                       for idx in range(ncols)))

    # No separator was emitted (e.g. it was swallowed by padding):
    # insert one right after the header row.
    if not have_header_sep and out:
        out.insert(1, fmt("-" * w for w in col_w))

    return "\n".join(out)
|
||||
|
||||
|
||||
def clean_header_junk(body):
    """Strip WordPress hero-block junk from the start of the content.

    Scans from the top: lines opening with two or more backslashes and
    lines containing known hero keywords are treated as junk (e.g.
    "\\\\\\ \\ MOHS... MATERIAL GUIDE\\\\ ## Titel"). Scanning stops at the
    first heading or first substantial plain-text line; everything before
    that point is dropped.
    """
    hero_keywords = ("MATERIAL GUIDE", "SETUP GUIDE", "KALIBRIER",
                     "TROUBLESHOOT", "SLICER GUIDE", "BEGINNER", "EXPERTEN",
                     "MOHS HÄRTE", "FEHLERBILD", "DRUCKPARAMETER")

    lines = body.split("\n")
    start = 0
    for idx, raw in enumerate(lines):
        text = raw.strip()
        # Runs of leading backslashes mark hero junk.
        if re.match(r'^\\{2,}', text):
            start = idx + 1
            continue
        # NOTE(review): checked before the heading test, so a heading that
        # contains a hero keyword is also discarded — presumably intended.
        if any(kw in text for kw in hero_keywords):
            start = idx + 1
            continue
        # First heading or substantial normal line ends the junk region.
        if text.startswith("#") or (len(text) > 20 and not text.startswith("\\")):
            break

    return "\n".join(lines[start:])
|
||||
|
||||
|
||||
def process_file(filepath):
    """Clean up one Markdown file in place.

    Keeps the YAML frontmatter untouched, strips WordPress hero junk and
    backslash artifacts from the body, rebuilds single-line pipe tables,
    collapses excess blank lines, and rewrites the file only when the
    result actually differs from the original.

    Returns True when the file was modified, False otherwise (including
    files without a frontmatter block, which are skipped entirely).
    """
    # Explicit UTF-8: the content contains umlauts (e.g. "HÄRTE"); the
    # locale default encoding (cp1252 on Windows) would corrupt or reject it.
    with open(filepath, encoding="utf-8") as f:
        content = f.read()

    original = content

    # Protect the frontmatter: only the body after the closing --- is edited.
    fm_match = re.match(r'^(---\n[\s\S]*?\n---)\n([\s\S]*)$', content)
    if not fm_match:
        return False

    fm = fm_match.group(1)
    body = fm_match.group(2)

    # 1. Remove hero junk at the top of the body.
    body = clean_header_junk(body)

    # 2. Backslash cleanup.
    body = re.sub(r'^\s*\\+\s*$', '', body, flags=re.MULTILINE)  # backslash-only lines
    body = re.sub(r'^\\ ', '', body, flags=re.MULTILINE)         # "\ " at line start
    body = re.sub(r'\s*\\$', '', body, flags=re.MULTILINE)       # "\" at line end
    body = re.sub(r'\\~', '~', body)                             # escaped tilde
    body = re.sub(r'\\(\d+)\.', r'\1.', body)                    # escaped list numbers

    # 3. Rebuild single-line pipe tables.
    body = fix_inline_tables(body)

    # 4. Collapse runs of blank lines and trim.
    body = re.sub(r'\n{3,}', '\n\n', body)
    body = body.strip()

    result = f"{fm}\n\n{body}\n"

    # Write back only on change so unmodified files keep their mtime.
    if result != original:
        with open(filepath, 'w', encoding="utf-8") as f:
            f.write(result)
        return True
    return False
|
||||
|
||||
|
||||
# --- Main: repair pass followed by a verification pass ---
print("=== Finales Tabellen- und Content-Cleanup ===\n")

# Collect all Markdown files once, sorted per directory (same order as
# iterating DIRS and sorting each glob individually).
md_files = []
for d in DIRS:
    md_files.extend(sorted(glob.glob(os.path.join(d, "*.md"))))

total = 0
for fp in md_files:
    if process_file(fp):
        print(f"  FIXED: {os.path.basename(fp)}")
        total += 1

print(f"\n{total} Dateien repariert.\n")

# Verification: report any file that still contains collapsed one-line
# tables or backslash junk.
print("=== Verifikation ===")
issues = 0
for fp in md_files:
    with open(fp) as f:
        text = f.read()

    # One-line tables: >= 8 pipes plus a separator fragment on one line.
    inline_tables = sum(
        1 for ln in text.split("\n")
        if ln.count("|") >= 8 and "| ---" in ln
    )

    # Remaining backslash-junk lines.
    bs = len(re.findall(r'^\s*\\{2,}\s', text, re.MULTILINE))

    if inline_tables or bs:
        print(f"  {os.path.basename(fp)}: {inline_tables} inline-tables, {bs} backslash-junk")
        issues += 1

print(f"\n{'Alle sauber!' if issues == 0 else f'{issues} Dateien mit Problemen'}")
|
||||
Reference in New Issue
Block a user