Grundproblem: WordPress speicherte HTML mit escaped \n (literal \\n statt Newlines) und inline style-Attributen in Tabellen. node-html-markdown konvertierte diese als Backslash-Artefakte und einzeilige Pipe-Strings. Loesung: Neues final-rebuild.mjs Skript: - \\n -> echte Newlines VOR der Konvertierung - style-Attribute komplett entfernt (verursachten Backslash-Tabellen) - Nav/Footer/SVG per Regex vor dem Parsing entfernt - Tabellen werden jetzt korrekt mehrzeilig mit Header/Separator/Rows gerendert - 44 Guides + 15 Pages verifiziert: 0 Probleme Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
326 lines
11 KiB
Python
326 lines
11 KiB
Python
"""
|
|
Komplette Neukonvertierung aller Guides aus dem Original-HTML-Backup.
|
|
Parst HTML direkt mit BeautifulSoup-aehnlichem Ansatz via html.parser,
|
|
extrahiert Tabellen korrekt und entfernt WordPress-Bloat.
|
|
"""
|
|
import re, os, glob
|
|
from html.parser import HTMLParser
|
|
|
|
|
|
class ContentExtractor(HTMLParser):
    """Extracts structured Markdown content from WordPress HTML.

    Feed HTML via ``feed()`` and read the result with ``get_markdown()``.
    Navigation/footer chrome is dropped, tables are rendered as aligned
    Markdown pipe tables, and common inline tags (strong/em/a/code/...)
    are translated to their Markdown equivalents.
    """

    # Elements whose entire subtree is always dropped.
    SKIP_TAGS = {"nav", "footer", "button", "style", "script"}
    # WordPress theme classes whose elements (any tag) are dropped.
    SKIP_CLASSES = {"v2-nav", "v2-mobile-menu", "v2-footer", "v2-hamburger",
                    "v2-nav-links", "v2-mobile-cta", "v2-nav-cta", "hub-section"}
    # Void elements never receive an end tag, so they must not take part
    # in the skip-depth bookkeeping below.
    VOID_TAGS = {"area", "base", "br", "col", "embed", "hr", "img", "input",
                 "link", "meta", "param", "source", "track", "wbr"}

    def __init__(self):
        super().__init__()
        self.output = []           # markdown fragments, joined in get_markdown()
        self.skip_depth = 0        # >0 while inside a skipped subtree
        self.tag_stack = []        # open (non-skipped) tags, for debugging
        self.link_stack = []       # hrefs of currently open <a> tags
        self.in_table = False
        self.table_data = []       # list of {"cells": [...], "header": bool}
        self.current_row = []
        self.current_cell = ""
        self.in_thead = False
        self.cell_is_header = False
        self.in_list = None        # "ol", "ul" or None
        self.list_counter = 0      # running number for ordered lists

    def _get_class(self, attrs):
        """Return the class attribute of a start tag ('' if absent)."""
        for name, value in attrs:
            if name == "class":
                return value or ""
        return ""

    def handle_starttag(self, tag, attrs):
        # While skipping, track nesting depth so the matching end tag of
        # the skip root (which may be a plain <div>) re-enables output.
        # BUGFIX: the previous version only decremented skip_depth for
        # SKIP_TAGS, so a class-based skip (<div class="v2-nav">) silently
        # swallowed the entire rest of the document.
        if self.skip_depth > 0:
            if tag not in self.VOID_TAGS:
                self.skip_depth += 1
            return

        cls = self._get_class(attrs)
        if tag in self.SKIP_TAGS or any(c in cls for c in self.SKIP_CLASSES):
            self.skip_depth = 1
            return

        self.tag_stack.append(tag)

        if tag == "table":
            self.in_table = True
            self.table_data = []
        elif tag == "thead":
            self.in_thead = True
        elif tag == "tbody":
            self.in_thead = False
        elif tag == "tr":
            self.current_row = []
        elif tag in ("th", "td"):
            self.current_cell = ""
            self.cell_is_header = tag == "th" or self.in_thead
        elif tag in ("h1", "h2", "h3", "h4"):
            level = int(tag[1])
            self.output.append(f"\n\n{'#' * level} ")
        elif tag == "p" and not self.in_table:
            self.output.append("\n\n")
        elif tag in ("strong", "b"):
            self.output.append("**")
        elif tag in ("em", "i"):
            self.output.append("*")
        elif tag == "a":
            # BUGFIX: remember the href on a dedicated stack. Storing it by
            # overwriting tag_stack[-1] broke whenever a void tag (<br>,
            # <img>) was nested inside the anchor.
            self.link_stack.append(dict(attrs).get("href", ""))
            self.output.append("[")
        elif tag == "br":
            if self.in_table:
                self.current_cell += " "   # keep table cells single-line
            else:
                self.output.append("\n")
        elif tag == "ul":
            self.in_list = "ul"
            self.output.append("\n")
        elif tag == "ol":
            self.in_list = "ol"
            self.list_counter = 0
            self.output.append("\n")
        elif tag == "li":
            if self.in_list == "ol":
                self.list_counter += 1
                self.output.append(f"\n{self.list_counter}. ")
            else:
                self.output.append("\n- ")
        elif tag == "code":
            self.output.append("`")
        elif tag == "pre":
            self.output.append("\n```\n")
        elif tag == "blockquote":
            self.output.append("\n> ")
        elif tag == "hr":
            self.output.append("\n\n---\n\n")

    def handle_endtag(self, tag):
        if self.skip_depth > 0:
            # Mirror handle_starttag: every non-void end tag closes one
            # level of the skipped subtree.
            if tag not in self.VOID_TAGS:
                self.skip_depth -= 1
            return

        if tag in self.SKIP_TAGS:
            # Stray end tag of a skip element outside a skip region: ignore.
            return

        if tag == "table":
            self.in_table = False
            if self.table_data:
                self.output.append("\n\n" + self._render_table() + "\n\n")
            self.table_data = []
        elif tag == "thead":
            self.in_thead = False
        elif tag == "tr":
            if self.current_row:
                self.table_data.append({
                    "cells": self.current_row,
                    # A leading row of <th> cells counts as header even
                    # without an explicit <thead>.
                    "header": self.in_thead or (len(self.table_data) == 0 and self.cell_is_header)
                })
        elif tag in ("th", "td"):
            self.current_row.append(self.current_cell.strip())
        elif tag in ("h1", "h2", "h3", "h4"):
            self.output.append("\n\n")
        elif tag in ("strong", "b"):
            self.output.append("**")
        elif tag in ("em", "i"):
            self.output.append("*")
        elif tag == "a":
            href = self.link_stack.pop() if self.link_stack else ""
            self.output.append(f"]({href})")
        elif tag in ("ul", "ol"):
            self.in_list = None
            self.output.append("\n")
        elif tag == "code":
            self.output.append("`")
        elif tag == "pre":
            self.output.append("\n```\n")
        elif tag == "p" and not self.in_table:
            self.output.append("\n")

        if self.tag_stack:
            self.tag_stack.pop()

    def handle_data(self, data):
        if self.skip_depth > 0:
            return
        if self.in_table:
            # Text inside a table always belongs to the current cell.
            self.current_cell += data
        else:
            self.output.append(data)

    def _render_table(self):
        """Render collected table_data as an aligned Markdown pipe table."""
        if not self.table_data:
            return ""

        headers = [r for r in self.table_data if r["header"]]
        rows = [r for r in self.table_data if not r["header"]]

        # Tables without an explicit header: promote the first body row.
        if not headers and rows:
            headers = [rows.pop(0)]
        if not headers:
            return ""

        num_cols = len(headers[0]["cells"])

        # Column widths: widest cell per column, minimum 3 for "---".
        widths = [len(h) for h in headers[0]["cells"]]
        for row in rows:
            for i, cell in enumerate(row["cells"][:num_cols]):
                if i < len(widths):
                    widths[i] = max(widths[i], len(cell))
        widths = [max(w, 3) for w in widths]

        lines = []
        # Header row
        hcells = headers[0]["cells"][:num_cols]
        lines.append("| " + " | ".join(c.ljust(widths[i]) for i, c in enumerate(hcells)) + " |")
        # Separator row
        lines.append("| " + " | ".join("-" * w for w in widths) + " |")
        # Body rows, padded (or truncated) to the header's column count
        for row in rows:
            cells = row["cells"][:num_cols]
            padded = []
            for i in range(num_cols):
                val = cells[i] if i < len(cells) else ""
                padded.append(val.ljust(widths[i]))
            lines.append("| " + " | ".join(padded) + " |")

        return "\n".join(lines)

    def get_markdown(self):
        """Return the accumulated Markdown with excess blank lines collapsed."""
        text = "".join(self.output)
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text.strip()
|
|
|
|
|
|
# Category assignment
def categorize(slug):
    """Map a guide slug to its content category via keyword matching.

    Rules are checked in order; the first rule with a matching keyword wins,
    falling back to "Allgemein".
    """
    s = slug.lower()
    rules = [
        ("Slicer", ("guide-orcaslicer", "guide-cura", "guide-bambu",
                    "guide-prusaslicer", "slicer")),
        ("Materialien", ("pla", "petg", "tpu", "asa", "abs", "nylon",
                         "carbon", "resin", "filament", "bed-adhesion")),
        ("Fehlerbehebung", ("stringing", "warping", "unterextrusion",
                            "layer-separation", "elefantenfuss", "verstopfte")),
        ("Kalibrierung", ("retraction", "flow-rate", "pressure-advance",
                          "input-shaping", "temperaturturm", "speed-tower",
                          "erste-schicht", "druckbett-leveln")),
        ("Fortgeschritten", ("adaptive", "modifier", "ironing", "fuzzy",
                             "multi-material", "klipper")),
        ("Grundlagen", ("erstes-modell", "support", "infill", "duesenwechsel",
                        "druckzeit", "masshaltigkeit", "bruecken",
                        "nachbearbeiten", "gridfinity", "naht")),
    ]
    for category, keywords in rules:
        if any(k in s for k in keywords):
            return category
    return "Allgemein"
|
|
|
|
def difficulty(slug, cat):
    """Derive the difficulty level from slug keywords and the category.

    Note: the slug is matched as-is (not lowercased), matching how the
    callers pass slugs straight from the frontmatter.
    """
    beginner_keys = ("erstes-modell", "erste-schicht", "druckbett-leveln")
    expert_keys = ("klipper", "pressure-advance", "input-shaping", "carbon", "nylon-pa")

    if any(k in slug for k in beginner_keys):
        return "einsteiger"
    if cat == "Fortgeschritten" or any(k in slug for k in expert_keys):
        return "experte"
    return "fortgeschritten"
|
|
|
|
|
|
# Slugs excluded from re-conversion; process_html() returns None for these
# files so their existing Markdown output is left untouched.
SKIP_SLUGS = {
    "guide-1-pla-perfekt-einstellen-2026-03-25",
    "guide-2-stringing-reduzieren-2026-03-25",
    "guide-1-warping-vermeiden-2026-03-26",
    "guide-2-petg-ohne-frust-2026-03-26",
}
|
|
|
|
|
|
def process_html(filepath, out_dir):
    """Convert one WordPress HTML backup file to a Markdown guide/page.

    Reads frontmatter + HTML body from *filepath*, extracts the content via
    ContentExtractor, strips WordPress artefacts and writes ``<slug>.md``
    into *out_dir*.

    Returns:
        The slug on success, or None if the file has no frontmatter or its
        slug is listed in SKIP_SLUGS.
    """
    # Explicit encoding: the backups contain German text; without it the
    # platform default (e.g. cp1252 on Windows) can corrupt or reject it.
    with open(filepath, encoding="utf-8") as f:
        raw = f.read()

    # Split "---\n<frontmatter>\n---\n<html>" into its two parts.
    fm_match = re.match(r'^---\n([\s\S]*?)\n---\n([\s\S]*)$', raw)
    if not fm_match:
        return None

    fm_block = fm_match.group(1)
    html_content = fm_match.group(2).strip()

    title_m = re.search(r'title:\s*"(.+?)"', fm_block)
    slug_m = re.search(r'slug:\s*"(.+?)"', fm_block)
    excerpt_m = re.search(r'excerpt:\s*"(.*?)"', fm_block)

    # Fall back to the file name (without .html) for missing fields.
    fallback = os.path.basename(filepath).replace(".html", "")
    title = title_m.group(1) if title_m else fallback
    slug = slug_m.group(1) if slug_m else fallback
    excerpt = excerpt_m.group(1) if excerpt_m else ""

    if slug in SKIP_SLUGS:
        return None

    # Parse the HTML body into Markdown.
    parser = ContentExtractor()
    parser.feed(html_content)
    md = parser.get_markdown()

    # Remove WordPress block comments.
    md = re.sub(r'<!-- /?wp:\w+ -->', '', md)

    # Remaining artefacts: backslash-only lines, excess blank lines.
    md = re.sub(r'^\s*\\+\s*$', '', md, flags=re.MULTILINE)
    md = re.sub(r'\n{3,}', '\n\n', md)
    md = md.strip()

    cat = categorize(slug)
    diff = difficulty(slug, cat)

    frontmatter = f'---\ntitle: "{title}"\nslug: "{slug}"\ncategory: "{cat}"\ndifficulty: "{diff}"\nexcerpt: "{excerpt}"\n---'

    out_path = os.path.join(out_dir, f"{slug}.md")
    with open(out_path, 'w', encoding="utf-8") as f:
        f.write(f"{frontmatter}\n\n{md}\n")

    return slug
|
|
|
|
|
|
# Main
|
|
print("=== Komplette Neukonvertierung aus HTML-Backup ===\n")
|
|
|
|
backup_posts = "backup/content/posts"
|
|
backup_pages = "backup/content/pages"
|
|
out_guides = "app/src/content/guides"
|
|
out_pages = "app/src/content/pages"
|
|
|
|
os.makedirs(out_guides, exist_ok=True)
|
|
os.makedirs(out_pages, exist_ok=True)
|
|
|
|
print("Guides:")
|
|
ok = 0
|
|
for f in sorted(glob.glob(os.path.join(backup_posts, "*.html"))):
|
|
result = process_html(f, out_guides)
|
|
if result:
|
|
print(f" OK: {result}")
|
|
ok += 1
|
|
print(f"\n{ok} Guides.\n")
|
|
|
|
print("Pages:")
|
|
pok = 0
|
|
for f in sorted(glob.glob(os.path.join(backup_pages, "*.html"))):
|
|
result = process_html(f, out_pages)
|
|
if result:
|
|
print(f" OK: {result}")
|
|
pok += 1
|
|
print(f"\n{pok} Pages.\n")
|
|
|
|
# Verify
|
|
print("=== Verifikation ===")
|
|
issues = 0
|
|
for d in [out_guides, out_pages]:
|
|
for fp in sorted(glob.glob(os.path.join(d, "*.md"))):
|
|
with open(fp) as f:
|
|
c = f.read()
|
|
# Inline-Tabellen (sollte keine geben)
|
|
inline = sum(1 for line in c.split("\n") if line.count("|") >= 8 and "| ---" in line)
|
|
# Backslash-Muell
|
|
bs = len(re.findall(r'^\s*\\{2,}', c, re.MULTILINE))
|
|
# Tabellen vorhanden?
|
|
tables = c.count("\n| ---")
|
|
if inline > 0 or bs > 0:
|
|
print(f" ISSUE: {os.path.basename(fp)} ({inline} inline, {bs} bs)")
|
|
issues += 1
|
|
elif tables > 0:
|
|
print(f" {os.path.basename(fp)}: {tables} Tabellen OK")
|
|
|
|
print(f"\n{'Alle sauber!' if issues == 0 else f'{issues} Probleme'}")
|