fix: Komplette Neukonvertierung — Tabellen und Content endgueltig sauber
Grundproblem: WordPress speicherte HTML mit escaped \n (literal \\n statt Newlines) und inline style-Attributen in Tabellen. node-html-markdown konvertierte diese als Backslash-Artefakte und einzeilige Pipe-Strings. Loesung: Neues final-rebuild.mjs Skript: - \\n -> echte Newlines VOR der Konvertierung - style-Attribute komplett entfernt (verursachten Backslash-Tabellen) - Nav/Footer/SVG per Regex vor dem Parsing entfernt - Tabellen werden jetzt korrekt mehrzeilig mit Header/Separator/Rows gerendert - 44 Guides + 15 Pages verifiziert: 0 Probleme Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
171
scripts/final-rebuild.mjs
Normal file
171
scripts/final-rebuild.mjs
Normal file
@@ -0,0 +1,171 @@
|
||||
/**
|
||||
* FINALE Neukonvertierung aller Guides.
|
||||
*
|
||||
* Strategie:
|
||||
* 1. \\n -> echte Newlines
|
||||
* 2. Nav/Footer/SVG komplett entfernen
|
||||
* 3. style-Attribute entfernen (verursachen Backslash-Tabellen)
|
||||
* 4. node-html-markdown konvertiert den bereinigten HTML
|
||||
* 5. Post-Cleanup: verbleibende Artefakte
|
||||
*/
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import { NodeHtmlMarkdown } from "node-html-markdown";
|
||||
|
||||
// Source directories: raw WordPress HTML exports (front matter + HTML body).
const BACKUP_POSTS = "backup/content/posts";
const BACKUP_PAGES = "backup/content/pages";
// Target directories: converted Markdown consumed by the app.
const OUT_GUIDES = "app/src/content/guides";
const OUT_PAGES = "app/src/content/pages";

// Slugs excluded from conversion (presumably superseded early test guides —
// confirm before removing entries).
const SKIP = new Set([
  "guide-1-pla-perfekt-einstellen-2026-03-25",
  "guide-2-stringing-reduzieren-2026-03-25",
  "guide-1-warping-vermeiden-2026-03-26",
  "guide-2-petg-ohne-frust-2026-03-26",
]);
|
||||
|
||||
/**
 * Maps a guide slug to its content category.
 * Rules are checked in priority order — the first matching pattern wins,
 * so e.g. slicer slugs are classified before material keywords.
 * @param {string} s - Guide slug.
 * @returns {string} Category label; "Allgemein" when nothing matches.
 */
const categorize = (s) => {
  const rules = [
    ["Slicer", /guide-orcaslicer|guide-cura|guide-bambu|guide-prusaslicer|slicer/],
    ["Materialien", /pla|petg|tpu|asa|abs|nylon|carbon|resin|filament|bed-adhesion/],
    ["Fehlerbehebung", /stringing|warping|unterextrusion|layer-separation|elefantenfuss|verstopfte/],
    ["Kalibrierung", /retraction|flow-rate|pressure-advance|input-shaping|temperaturturm|speed-tower|erste-schicht|druckbett-leveln/],
    ["Fortgeschritten", /adaptive|modifier|ironing|fuzzy|multi-material|klipper/],
    ["Grundlagen", /erstes-modell|support|infill|duesenwechsel|druckzeit|masshaltigkeit|bruecken|nachbearbeiten|gridfinity|naht/],
  ];
  const hit = rules.find(([, pattern]) => pattern.test(s));
  return hit ? hit[0] : "Allgemein";
};
|
||||
|
||||
/**
 * Derives the difficulty level from slug keywords and category.
 * @param {string} slug - Guide slug.
 * @param {string} cat - Category returned by categorize().
 * @returns {string} One of "einsteiger", "fortgeschritten", "experte".
 */
const diff = (slug, cat) => {
  const beginnerPattern = /erstes-modell|erste-schicht|druckbett-leveln/;
  const expertPattern = /klipper|pressure-advance|input-shaping|carbon|nylon-pa/;
  if (beginnerPattern.test(slug)) {
    return "einsteiger";
  }
  if (cat === "Fortgeschritten" || expertPattern.test(slug)) {
    return "experte";
  }
  return "fortgeschritten";
};
|
||||
|
||||
// Shared converter instance. keepDataImages=false drops inline base64 images;
// useLinkReferenceDefinitions=false keeps links inline instead of emitting
// [ref]-style link definitions at the bottom of the document.
const nhm = new NodeHtmlMarkdown({
  keepDataImages: false,
  useLinkReferenceDefinitions: false,
});
|
||||
|
||||
/**
 * Strips WordPress/theme boilerplate from raw HTML before the Markdown
 * conversion.
 *
 * Order matters: escaped newlines are unescaped first so the structural
 * regexes below see the real markup, and style attributes are removed
 * before conversion because node-html-markdown turns styled table cells
 * into backslash artifacts.
 *
 * @param {string} raw - HTML body as stored in the WordPress export.
 * @returns {string} Cleaned HTML ready for NodeHtmlMarkdown.translate().
 */
function cleanHtml(raw) {
  let html = raw;

  // 1. Escaped newlines -> real ones (the export stored literal "\n").
  html = html.replace(/\\n/g, "\n");

  // 2. Remove WordPress block comments. This also covers comments carrying
  //    block attributes (e.g. <!-- wp:heading {"level":2} -->) and
  //    hyphenated block names (e.g. wp:media-text), which the previous
  //    pattern `<!-- \/?wp:\w+ -->` missed.
  html = html.replace(/<!--\s*\/?wp:[\s\S]*?-->/g, "");

  // 3. Remove complete nav blocks.
  html = html.replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, "");
  html = html.replace(/<div[^>]*class="v2-mobile-menu"[^>]*>[\s\S]*?<\/div>/gi, "");

  // 4. Remove footer blocks.
  html = html.replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, "");

  // 5. Remove SVG blocks (hero graphics).
  html = html.replace(/<svg[^>]*>[\s\S]*?<\/svg>/gi, "");

  // 6. CRITICAL: remove style attributes (they caused backslash tables).
  //    Handles both double- and single-quoted attribute values.
  html = html.replace(/\s+style=(?:"[^"]*"|'[^']*')/gi, "");

  // 7. Unwrap divs with known theme classes (opening tags only; the stray
  //    closing </div> tags are harmless to the Markdown conversion).
  html = html.replace(/<div[^>]*class="(?:v2-guide|hub-section|v2-footer|lead)"[^>]*>/gi, "");

  // 8. Strip class attributes (double- or single-quoted).
  html = html.replace(/\s+class=(?:"[^"]*"|'[^']*')/gi, "");

  // 9. Hamburger buttons and other interactive controls.
  html = html.replace(/<button[^>]*>[\s\S]*?<\/button>/gi, "");

  return html;
}
|
||||
|
||||
/**
 * Converts one exported HTML file (front matter + HTML body) to Markdown
 * and writes it to `outDir/<slug>.md`.
 *
 * @param {string} filePath - Path to a .html export starting with a ---...--- front matter block.
 * @param {string} outDir - Target directory for the generated Markdown file.
 * @returns {string|null} The slug on success; null when the file has no
 *   front matter or its slug is in the SKIP set.
 */
function processFile(filePath, outDir) {
  const raw = fs.readFileSync(filePath, "utf-8");
  const fmMatch = raw.match(/^---\n([\s\S]*?)\n---\n([\s\S]*)$/);
  if (!fmMatch) return null;

  const fmBlock = fmMatch[1];
  const htmlContent = fmMatch[2].trim();

  // Fall back to the file name when the front matter lacks title/slug.
  const title = fmBlock.match(/title:\s*"(.+?)"/)?.[1] ?? path.basename(filePath, ".html");
  const slug = fmBlock.match(/slug:\s*"(.+?)"/)?.[1] ?? path.basename(filePath, ".html");

  if (SKIP.has(slug)) return null;

  // Clean the HTML (nav/footer/SVG/style removal etc.).
  const cleanedHtml = cleanHtml(htmlContent);

  // Convert to Markdown.
  let md = nhm.translate(cleanedHtml);

  // Post-cleanup of conversion artifacts. Order matters: theme/WordPress
  // remnants first, then backslash artifacts, then escaped punctuation.
  md = md
    // Theme remnants (hero banner, nav/footer text fragments).
    .replace(/M0LZI[_\\]*3D[\s\S]*?\[SYS_BOOT[^\]]*\]/g, "")
    .replace(/\[WISSEN\][\s\S]*?\[FAQ\]/gs, "")
    .replace(/JOIN_NETWORK.*?→/g, "")
    .replace(/M0LZI[_\\]*3D © \d{4}/g, "")
    .replace(/\[WHATSAPP\][\s\S]*?\[DATENSCHUTZ\][\s\S]*?$/gm, "")
    // Backslash artifacts left over from styled table cells.
    .replace(/^\s*\\+\s*$/gm, "")
    .replace(/^\\ /gm, "")
    .replace(/ \\$/gm, "")
    .replace(/\\\s{2,}/g, " ")
    // Unescape punctuation that is safe in this content.
    .replace(/\\~/g, "~")
    .replace(/\\\[/g, "[")
    .replace(/\\\]/g, "]")
    .replace(/\\_/g, "_")
    // Collapse runs of blank lines.
    .replace(/\n{3,}/g, "\n\n")
    .trim();

  const cat = categorize(slug);
  const d = diff(slug, cat);

  // NOTE(review): excerpt is intentionally written empty here — confirm it
  // gets filled by a later step or manually.
  const fm = `---\ntitle: "${title}"\nslug: "${slug}"\ncategory: "${cat}"\ndifficulty: "${d}"\nexcerpt: ""\n---`;

  const outPath = path.join(outDir, `${slug}.md`);
  fs.writeFileSync(outPath, `${fm}\n\n${md}\n`, "utf-8");
  return slug;
}
|
||||
|
||||
// --- Main ---------------------------------------------------------------
console.log("=== FINALE Neukonvertierung ===\n");

/**
 * Converts every .html file in srcDir into outDir.
 * @param {string} srcDir - Directory with HTML exports.
 * @param {string} outDir - Target directory for Markdown files.
 * @returns {number} Count of successfully written files.
 */
const convertDir = (srcDir, outDir) => {
  let count = 0;
  const files = fs.readdirSync(srcDir).filter((f) => f.endsWith(".html")).sort();
  for (const f of files) {
    const slug = processFile(path.join(srcDir, f), outDir);
    if (slug) {
      console.log(` OK: ${slug}`);
      count++;
    }
  }
  return count;
};

const ok = convertDir(BACKUP_POSTS, OUT_GUIDES);
console.log(`\n${ok} Guides.\n`);

const pok = convertDir(BACKUP_PAGES, OUT_PAGES);
console.log(`\n${pok} Pages.\n`);

// --- Verification --------------------------------------------------------
console.log("=== Verifikation ===");
let issues = 0;
for (const dir of [OUT_GUIDES, OUT_PAGES]) {
  for (const f of fs.readdirSync(dir).filter((n) => n.endsWith(".md")).sort()) {
    const c = fs.readFileSync(path.join(dir, f), "utf-8");
    // A "collapsed" table: separator plus many pipes on a single line.
    const inlineTables = c.split("\n").filter((l) => l.includes("| ---") && l.split("|").length > 8).length;
    // Lines that still start with a run of backslashes.
    const backslashJunk = (c.match(/^\s*\\{2,}/gm) || []).length;
    // (The previously computed, never-used table count was removed.)

    if (inlineTables > 0 || backslashJunk > 0) {
      console.log(` ISSUE: ${f} (${inlineTables} inline, ${backslashJunk} bs)`);
      issues++;
    }
  }
}
console.log(issues === 0 ? "\nAlle sauber!" : `\n${issues} Probleme`);

// --- Spot check: print the first 40 lines of a known table-heavy guide ---
console.log("\n=== Stichprobe: Carbon Fiber ===");
const cf = fs.readFileSync(path.join(OUT_GUIDES, "carbon-fiber-glasfaser-filamente.md"), "utf-8");
console.log(cf.split("\n").slice(0, 40).join("\n"));
|
||||
226
scripts/fix-tables-final.py
Normal file
226
scripts/fix-tables-final.py
Normal file
@@ -0,0 +1,226 @@
|
||||
"""
|
||||
Finales Reparatur-Skript:
|
||||
1. Header-Junk (WordPress Hero-Bloecke) entfernen
|
||||
2. Einzeilige Pipe-Tabellen in echte mehrzeilige Markdown-Tabellen umwandeln
|
||||
3. Backslash-Artefakte bereinigen
|
||||
"""
|
||||
import re, os, glob
|
||||
|
||||
# Content directories to repair in place (guides + static pages).
DIRS = ["app/src/content/guides", "app/src/content/pages"]
|
||||
|
||||
|
||||
def fix_inline_tables(text):
    """Expand single-line pipe tables into proper multi-line Markdown tables.

    A line is treated as a collapsed table when it contains at least eight
    pipe characters and an inline ``| ---`` separator; every other line is
    passed through unchanged, as is any candidate line that
    ``parse_inline_table`` cannot parse.
    """
    result = []
    for raw_line in text.split("\n"):
        looks_collapsed = raw_line.count("|") >= 8 and "| ---" in raw_line
        rebuilt = parse_inline_table(raw_line) if looks_collapsed else None
        result.append(rebuilt if rebuilt else raw_line)
    return "\n".join(result)
|
||||
|
||||
|
||||
def _is_dash_cell(cell):
    """True when *cell* is a pure separator cell (three or more dashes)."""
    return re.match(r'^-{3,}$', cell) is not None


def parse_inline_table(line):
    """Parse a collapsed single-line pipe table into a multi-line table.

    Returns the rebuilt Markdown table as a string, or ``None`` when *line*
    does not look like a table (no separator cells, fewer than two columns,
    or fewer than two resulting rows).
    """
    cells = [c.strip() for c in line.split("|")]
    cells = [c for c in cells if c]
    if not cells:
        return None

    separator_positions = [i for i, c in enumerate(cells) if _is_dash_cell(c)]
    if not separator_positions:
        return None

    # The header ends where the first separator cell begins.
    num_cols = separator_positions[0]
    if num_cols < 2:
        return None

    # Re-chunk the flat cell list into rows of num_cols cells each.
    rows = []
    for start in range(0, len(cells), num_cols):
        chunk = cells[start:start + num_cols]
        if len(chunk) == num_cols:
            rows.append("SEP" if all(_is_dash_cell(c) for c in chunk) else chunk)
        elif chunk:
            # Short trailing chunk: pad with empty cells; keep it unless it
            # consists purely of separator dashes.
            chunk = chunk + [""] * (num_cols - len(chunk))
            if not all(_is_dash_cell(c) for c in chunk if c):
                rows.append(chunk)

    if len(rows) < 2:
        return None

    # Column widths: widest cell per column, minimum 3 so the separator row
    # stays valid Markdown.
    widths = [0] * num_cols
    for row in rows:
        if row == "SEP":
            continue
        for col, cell in enumerate(row):
            if col < num_cols:
                widths[col] = max(widths[col], len(cell))
    widths = [max(w, 3) for w in widths]

    separator_line = "| " + " | ".join("-" * w for w in widths) + " |"
    md_lines = []
    header_done = False

    for row in rows:
        if row == "SEP":
            # Only the first separator becomes the header separator row.
            if not header_done:
                md_lines.append(separator_line)
                header_done = True
            continue
        padded = [(row[col] if col < len(row) else "").ljust(widths[col])
                  for col in range(num_cols)]
        md_lines.append("| " + " | ".join(padded) + " |")

    # No separator row survived chunking: insert one after the header line.
    if not header_done and len(md_lines) >= 1:
        md_lines.insert(1, separator_line)

    return "\n".join(md_lines)
|
||||
|
||||
|
||||
# Keywords that identify hero-banner badge lines at the top of a guide.
_HERO_KEYWORDS = ("MATERIAL GUIDE", "SETUP GUIDE", "KALIBRIER",
                  "TROUBLESHOOT", "SLICER GUIDE", "BEGINNER", "EXPERTEN",
                  "MOHS HÄRTE", "FEHLERBILD", "DRUCKPARAMETER")


def clean_header_junk(body):
    """Drop WordPress hero-banner junk from the top of converted Markdown.

    Scans from the top: lines starting with runs of backslashes or containing
    hero keywords advance the cut point; the scan stops at the first heading
    or the first substantial line of normal text. Everything before the cut
    point is discarded.
    """
    lines = body.split("\n")
    cut = 0
    for index, raw in enumerate(lines):
        text = raw.strip()
        # Backslash runs are artifacts of the converted hero block.
        if re.match(r'^\\{2,}', text):
            cut = index + 1
            continue
        # Hero badge/keyword lines.
        if any(keyword in text for keyword in _HERO_KEYWORDS):
            cut = index + 1
            continue
        # First heading or substantial paragraph: real content starts here.
        if text.startswith("#") or (len(text) > 20 and not text.startswith("\\")):
            break
    return "\n".join(lines[cut:])
|
||||
|
||||
|
||||
def process_file(filepath):
    """Clean one Markdown file in place.

    Removes hero junk and backslash artifacts, expands collapsed pipe
    tables, and normalizes blank lines. The YAML front matter is preserved
    untouched. Returns True when the file was modified, False otherwise
    (including files without front matter).
    """
    # Read/write explicitly as UTF-8: the content contains German umlauts
    # (e.g. "MOHS HÄRTE") and must not depend on the platform default
    # encoding. The original bare open() broke on non-UTF-8 locales.
    with open(filepath, encoding="utf-8") as f:
        content = f.read()

    original = content

    # Protect the front matter: only the body below it is rewritten.
    fm_match = re.match(r'^(---\n[\s\S]*?\n---)\n([\s\S]*)$', content)
    if not fm_match:
        return False

    fm = fm_match.group(1)
    body = fm_match.group(2)

    # 1. Strip hero/header junk.
    body = clean_header_junk(body)

    # 2. Backslash cleanup.
    body = re.sub(r'^\s*\\+\s*$', '', body, flags=re.MULTILINE)  # backslash-only lines
    body = re.sub(r'^\\ ', '', body, flags=re.MULTILINE)         # \ at line start
    body = re.sub(r'\s*\\$', '', body, flags=re.MULTILINE)       # \ at line end
    body = re.sub(r'\\~', '~', body)                             # escaped tilde
    body = re.sub(r'\\(\d+)\.', r'\1.', body)                    # escaped list numbers

    # 3. Expand collapsed single-line pipe tables.
    body = fix_inline_tables(body)

    # 4. Final cleanup: collapse blank-line runs.
    body = re.sub(r'\n{3,}', '\n\n', body)
    body = body.strip()

    result = f"{fm}\n\n{body}\n"

    if result != original:
        with open(filepath, 'w', encoding="utf-8") as f:
            f.write(result)
        return True
    return False
|
||||
|
||||
|
||||
# --- Main ----------------------------------------------------------------
print("=== Finales Tabellen- und Content-Cleanup ===\n")

total = 0
for d in DIRS:
    for fp in sorted(glob.glob(os.path.join(d, "*.md"))):
        if process_file(fp):
            print(f" FIXED: {os.path.basename(fp)}")
            total += 1

print(f"\n{total} Dateien repariert.\n")

# --- Verification ---------------------------------------------------------
print("=== Verifikation ===")
issues = 0
for d in DIRS:
    for fp in sorted(glob.glob(os.path.join(d, "*.md"))):
        # Explicit UTF-8 read: content contains umlauts and must not be
        # decoded with the platform default encoding.
        with open(fp, encoding="utf-8") as f:
            c = f.read()

        # Collapsed tables: >= 8 pipes plus a "| ---" separator on one line.
        inline_tables = sum(1 for line in c.split("\n")
                            if line.count("|") >= 8 and "| ---" in line)

        # Lines still starting with a run of backslashes.
        bs = len(re.findall(r'^\s*\\{2,}\s', c, re.MULTILINE))

        if inline_tables > 0 or bs > 0:
            print(f" {os.path.basename(fp)}: {inline_tables} inline-tables, {bs} backslash-junk")
            issues += 1

print(f"\n{'Alle sauber!' if issues == 0 else f'{issues} Dateien mit Problemen'}")
|
||||
325
scripts/rebuild-from-html.py
Normal file
325
scripts/rebuild-from-html.py
Normal file
@@ -0,0 +1,325 @@
|
||||
"""
|
||||
Komplette Neukonvertierung aller Guides aus dem Original-HTML-Backup.
|
||||
Parst HTML direkt mit BeautifulSoup-aehnlichem Ansatz via html.parser,
|
||||
extrahiert Tabellen korrekt und entfernt WordPress-Bloat.
|
||||
"""
|
||||
import re, os, glob
|
||||
from html.parser import HTMLParser
|
||||
|
||||
|
||||
class ContentExtractor(HTMLParser):
    """Extracts structured Markdown from WordPress HTML.

    Navigation, footer, and other theme chrome are skipped entirely;
    tables are collected cell-by-cell and rendered as aligned Markdown
    tables; common inline/block tags are translated to their Markdown
    equivalents.
    """

    # Tags whose entire subtree is boilerplate and must be dropped.
    SKIP_TAGS = {"nav", "footer", "button", "style", "script"}
    # Theme CSS classes that mark boilerplate containers (substring match).
    SKIP_CLASSES = {"v2-nav", "v2-mobile-menu", "v2-footer", "v2-hamburger",
                    "v2-nav-links", "v2-mobile-cta", "v2-nav-cta", "hub-section"}

    def __init__(self):
        super().__init__()
        self.output = []          # Markdown fragments, joined by get_markdown()
        self.skip_depth = 0       # > 0 while inside a skipped subtree
        self.skip_tag = None      # tag name that opened the current skip
        self.link_hrefs = []      # href stack for open <a> elements
        self.in_table = False
        self.table_data = []      # list of {"cells": [...], "header": bool}
        self.current_row = []
        self.current_cell = ""
        self.in_thead = False
        self.cell_is_header = False
        self.in_list = None       # "ol" or "ul"
        self.list_counter = 0

    def _get_class(self, attrs):
        """Return the class attribute value, or "" when absent/None."""
        for key, value in attrs:
            if key == "class":
                return value or ""
        return ""

    def handle_starttag(self, tag, attrs):
        # While skipping, only count nested occurrences of the tag that
        # opened the skip so its matching end tag closes the skip correctly.
        # BUGFIX: previously, subtrees skipped via SKIP_CLASSES (e.g. a
        # <div class="v2-footer">) never decremented skip_depth, because
        # handle_endtag only decremented for tags in SKIP_TAGS — which
        # silently suppressed all content after the first skipped <div>.
        if self.skip_depth > 0:
            if tag == self.skip_tag:
                self.skip_depth += 1
            return

        cls = self._get_class(attrs)
        if tag in self.SKIP_TAGS or any(c in cls for c in self.SKIP_CLASSES):
            self.skip_tag = tag
            self.skip_depth = 1
            return

        if tag == "table":
            self.in_table = True
            self.table_data = []
        elif tag == "thead":
            self.in_thead = True
        elif tag == "tbody":
            self.in_thead = False
        elif tag == "tr":
            self.current_row = []
        elif tag in ("th", "td"):
            self.current_cell = ""
            self.cell_is_header = tag == "th" or self.in_thead
        elif tag in ("h1", "h2", "h3", "h4"):
            level = int(tag[1])
            self.output.append(f"\n\n{'#' * level} ")
        elif tag == "p" and not self.in_table:
            self.output.append("\n\n")
        elif tag in ("strong", "b"):
            self.output.append("**")
        elif tag in ("em", "i"):
            self.output.append("*")
        elif tag == "a":
            # BUGFIX: hrefs are kept on a dedicated stack. The old code
            # stored the href inside the generic tag stack, which void tags
            # such as <br> corrupted (e.g. <a>text<br>more</a> lost its
            # closing link markup).
            self.link_hrefs.append(dict(attrs).get("href", ""))
            self.output.append("[")
        elif tag == "br":
            if self.in_table:
                self.current_cell += " "
            else:
                self.output.append("\n")
        elif tag == "ul":
            self.in_list = "ul"
            self.output.append("\n")
        elif tag == "ol":
            self.in_list = "ol"
            self.list_counter = 0
            self.output.append("\n")
        elif tag == "li":
            if self.in_list == "ol":
                self.list_counter += 1
                self.output.append(f"\n{self.list_counter}. ")
            else:
                self.output.append("\n- ")
        elif tag == "code":
            self.output.append("`")
        elif tag == "pre":
            self.output.append("\n```\n")
        elif tag == "blockquote":
            self.output.append("\n> ")
        elif tag == "hr":
            self.output.append("\n\n---\n\n")

    def handle_endtag(self, tag):
        if self.skip_depth > 0:
            # Close the skip only on the matching end tag (handles nesting).
            if tag == self.skip_tag:
                self.skip_depth -= 1
                if self.skip_depth == 0:
                    self.skip_tag = None
            return

        if tag == "table":
            self.in_table = False
            if self.table_data:
                self.output.append("\n\n" + self._render_table() + "\n\n")
            self.table_data = []
        elif tag == "thead":
            self.in_thead = False
        elif tag == "tr":
            if self.current_row:
                self.table_data.append({
                    "cells": self.current_row,
                    "header": self.in_thead or (len(self.table_data) == 0 and self.cell_is_header)
                })
        elif tag in ("th", "td"):
            self.current_row.append(self.current_cell.strip())
        elif tag in ("h1", "h2", "h3", "h4"):
            self.output.append("\n\n")
        elif tag in ("strong", "b"):
            self.output.append("**")
        elif tag in ("em", "i"):
            self.output.append("*")
        elif tag == "a":
            href = self.link_hrefs.pop() if self.link_hrefs else ""
            self.output.append(f"]({href})")
        elif tag in ("ul", "ol"):
            self.in_list = None
            self.output.append("\n")
        elif tag == "code":
            self.output.append("`")
        elif tag == "pre":
            self.output.append("\n```\n")
        elif tag == "p" and not self.in_table:
            self.output.append("\n")

    def handle_data(self, data):
        if self.skip_depth > 0:
            return
        if self.in_table:
            # Inside a table, text accumulates into the current cell.
            self.current_cell += data
        else:
            self.output.append(data)

    def _render_table(self):
        """Render the collected table_data as an aligned Markdown table."""
        if not self.table_data:
            return ""

        headers = [r for r in self.table_data if r["header"]]
        rows = [r for r in self.table_data if not r["header"]]

        # No explicit header row: promote the first body row.
        if not headers and rows:
            headers = [rows.pop(0)]
        if not headers:
            return ""

        num_cols = len(headers[0]["cells"])

        # Column widths: widest cell per column, minimum 3 so the separator
        # row stays valid Markdown.
        widths = [len(h) for h in headers[0]["cells"]]
        for row in rows:
            for i, cell in enumerate(row["cells"][:num_cols]):
                if i < len(widths):
                    widths[i] = max(widths[i], len(cell))
        widths = [max(w, 3) for w in widths]

        lines = []
        hcells = headers[0]["cells"][:num_cols]
        lines.append("| " + " | ".join(c.ljust(widths[i]) for i, c in enumerate(hcells)) + " |")
        lines.append("| " + " | ".join("-" * w for w in widths) + " |")
        for row in rows:
            cells = row["cells"][:num_cols]
            padded = []
            for i in range(num_cols):
                val = cells[i] if i < len(cells) else ""
                padded.append(val.ljust(widths[i]))
            lines.append("| " + " | ".join(padded) + " |")

        return "\n".join(lines)

    def get_markdown(self):
        """Join all collected fragments and normalize blank-line runs."""
        text = "".join(self.output)
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text.strip()
|
||||
|
||||
|
||||
# Categorization: keyword rules checked in priority order; first match wins.
_CATEGORY_RULES = (
    ("Slicer", ("guide-orcaslicer", "guide-cura", "guide-bambu", "guide-prusaslicer", "slicer")),
    ("Materialien", ("pla", "petg", "tpu", "asa", "abs", "nylon", "carbon", "resin", "filament", "bed-adhesion")),
    ("Fehlerbehebung", ("stringing", "warping", "unterextrusion", "layer-separation", "elefantenfuss", "verstopfte")),
    ("Kalibrierung", ("retraction", "flow-rate", "pressure-advance", "input-shaping", "temperaturturm", "speed-tower", "erste-schicht", "druckbett-leveln")),
    ("Fortgeschritten", ("adaptive", "modifier", "ironing", "fuzzy", "multi-material", "klipper")),
    ("Grundlagen", ("erstes-modell", "support", "infill", "duesenwechsel", "druckzeit", "masshaltigkeit", "bruecken", "nachbearbeiten", "gridfinity", "naht")),
)


def categorize(slug):
    """Map a guide slug to its content category; falls back to "Allgemein"."""
    s = slug.lower()
    for category, keywords in _CATEGORY_RULES:
        if any(keyword in s for keyword in keywords):
            return category
    return "Allgemein"
|
||||
|
||||
def difficulty(slug, cat):
    """Derive the difficulty level from slug keywords and category."""
    beginner_keys = ("erstes-modell", "erste-schicht", "druckbett-leveln")
    expert_keys = ("klipper", "pressure-advance", "input-shaping", "carbon", "nylon-pa")
    if any(key in slug for key in beginner_keys):
        return "einsteiger"
    if cat == "Fortgeschritten" or any(key in slug for key in expert_keys):
        return "experte"
    return "fortgeschritten"
|
||||
|
||||
|
||||
# Slugs excluded from reconversion (presumably superseded early test guides
# — confirm before removing entries).
SKIP_SLUGS = {
    "guide-1-pla-perfekt-einstellen-2026-03-25",
    "guide-2-stringing-reduzieren-2026-03-25",
    "guide-1-warping-vermeiden-2026-03-26",
    "guide-2-petg-ohne-frust-2026-03-26",
}
|
||||
|
||||
|
||||
def process_html(filepath, out_dir):
    """Convert one exported HTML file to Markdown in *out_dir*.

    Expects the file to start with a ``---``...``---`` front matter block
    followed by the HTML body. Writes ``<slug>.md`` and returns the slug,
    or None when the file has no front matter or its slug is skipped.
    """
    # Explicit UTF-8: the exports contain German umlauts and must not be
    # read/written with the platform default encoding (the original bare
    # open() broke on non-UTF-8 locales).
    with open(filepath, encoding="utf-8") as f:
        raw = f.read()

    # Extract the front matter.
    fm_match = re.match(r'^---\n([\s\S]*?)\n---\n([\s\S]*)$', raw)
    if not fm_match:
        return None

    fm_block = fm_match.group(1)
    html_content = fm_match.group(2).strip()

    title_m = re.search(r'title:\s*"(.+?)"', fm_block)
    slug_m = re.search(r'slug:\s*"(.+?)"', fm_block)
    excerpt_m = re.search(r'excerpt:\s*"(.*?)"', fm_block)

    # Fall back to the file name when title/slug are missing.
    fallback = os.path.basename(filepath).replace(".html", "")
    title = title_m.group(1) if title_m else fallback
    slug = slug_m.group(1) if slug_m else fallback
    excerpt = excerpt_m.group(1) if excerpt_m else ""

    if slug in SKIP_SLUGS:
        return None

    # Parse the HTML into Markdown.
    parser = ContentExtractor()
    parser.feed(html_content)
    md = parser.get_markdown()

    # Remove WordPress block comments.
    md = re.sub(r'<!-- /?wp:\w+ -->', '', md)

    # Remaining artifacts: backslash-only lines and blank-line runs.
    md = re.sub(r'^\s*\\+\s*$', '', md, flags=re.MULTILINE)
    md = re.sub(r'\n{3,}', '\n\n', md)
    md = md.strip()

    cat = categorize(slug)
    diff = difficulty(slug, cat)

    frontmatter = f'---\ntitle: "{title}"\nslug: "{slug}"\ncategory: "{cat}"\ndifficulty: "{diff}"\nexcerpt: "{excerpt}"\n---'

    out_path = os.path.join(out_dir, f"{slug}.md")
    with open(out_path, 'w', encoding="utf-8") as f:
        f.write(f"{frontmatter}\n\n{md}\n")

    return slug
|
||||
|
||||
|
||||
# --- Main ----------------------------------------------------------------
print("=== Komplette Neukonvertierung aus HTML-Backup ===\n")

backup_posts = "backup/content/posts"
backup_pages = "backup/content/pages"
out_guides = "app/src/content/guides"
out_pages = "app/src/content/pages"

os.makedirs(out_guides, exist_ok=True)
os.makedirs(out_pages, exist_ok=True)

print("Guides:")
ok = 0
for f in sorted(glob.glob(os.path.join(backup_posts, "*.html"))):
    result = process_html(f, out_guides)
    if result:
        print(f" OK: {result}")
        ok += 1
print(f"\n{ok} Guides.\n")

print("Pages:")
pok = 0
for f in sorted(glob.glob(os.path.join(backup_pages, "*.html"))):
    result = process_html(f, out_pages)
    if result:
        print(f" OK: {result}")
        pok += 1
print(f"\n{pok} Pages.\n")

# --- Verification ---------------------------------------------------------
print("=== Verifikation ===")
issues = 0
for d in [out_guides, out_pages]:
    for fp in sorted(glob.glob(os.path.join(d, "*.md"))):
        # Explicit UTF-8 read: the generated Markdown contains umlauts and
        # must not be decoded with the platform default encoding.
        with open(fp, encoding="utf-8") as f:
            c = f.read()
        # Collapsed single-line tables (there should be none).
        inline = sum(1 for line in c.split("\n") if line.count("|") >= 8 and "| ---" in line)
        # Leftover backslash junk.
        bs = len(re.findall(r'^\s*\\{2,}', c, re.MULTILINE))
        # Number of rendered tables.
        tables = c.count("\n| ---")
        if inline > 0 or bs > 0:
            print(f" ISSUE: {os.path.basename(fp)} ({inline} inline, {bs} bs)")
            issues += 1
        elif tables > 0:
            print(f" {os.path.basename(fp)}: {tables} Tabellen OK")

print(f"\n{'Alle sauber!' if issues == 0 else f'{issues} Probleme'}")
|
||||
Reference in New Issue
Block a user