""" Komplette Neukonvertierung aller Guides aus dem Original-HTML-Backup. Parst HTML direkt mit BeautifulSoup-aehnlichem Ansatz via html.parser, extrahiert Tabellen korrekt und entfernt WordPress-Bloat. """ import re, os, glob from html.parser import HTMLParser class ContentExtractor(HTMLParser): """Extrahiert strukturierten Content aus WordPress-HTML.""" SKIP_TAGS = {"nav", "footer", "button", "style", "script"} SKIP_CLASSES = {"v2-nav", "v2-mobile-menu", "v2-footer", "v2-hamburger", "v2-nav-links", "v2-mobile-cta", "v2-nav-cta", "hub-section"} def __init__(self): super().__init__() self.output = [] self.skip_depth = 0 self.tag_stack = [] self.in_table = False self.table_data = [] self.current_row = [] self.current_cell = "" self.in_thead = False self.cell_is_header = False self.in_list = None # "ol" or "ul" self.list_counter = 0 def _get_class(self, attrs): for k, v in attrs: if k == "class": return v or "" return "" def handle_starttag(self, tag, attrs): cls = self._get_class(attrs) if tag in self.SKIP_TAGS or any(c in cls for c in self.SKIP_CLASSES): self.skip_depth += 1 return if self.skip_depth > 0: return self.tag_stack.append(tag) if tag == "table": self.in_table = True self.table_data = [] elif tag == "thead": self.in_thead = True elif tag == "tbody": self.in_thead = False elif tag == "tr": self.current_row = [] elif tag in ("th", "td"): self.current_cell = "" self.cell_is_header = tag == "th" or self.in_thead elif tag in ("h1", "h2", "h3", "h4"): level = int(tag[1]) self.output.append(f"\n\n{'#' * level} ") elif tag == "p" and not self.in_table: self.output.append("\n\n") elif tag == "strong" or tag == "b": self.output.append("**") elif tag == "em" or tag == "i": self.output.append("*") elif tag == "a": href = dict(attrs).get("href", "") self.output.append(f"[") self.tag_stack[-1] = ("a", href) elif tag == "br": if self.in_table: self.current_cell += " " else: self.output.append("\n") elif tag == "ul": self.in_list = "ul" self.output.append("\n") elif tag == "ol": self.in_list = "ol" self.list_counter = 0 self.output.append("\n") elif tag == "li": if self.in_list == "ol": self.list_counter += 1 self.output.append(f"\n{self.list_counter}. ") else: self.output.append("\n- ") elif tag == "code": self.output.append("`") elif tag == "pre": self.output.append("\n```\n") elif tag == "blockquote": self.output.append("\n> ") elif tag == "hr": self.output.append("\n\n---\n\n") def handle_endtag(self, tag): if tag in self.SKIP_TAGS or self.skip_depth > 0: if tag in self.SKIP_TAGS: self.skip_depth = max(0, self.skip_depth - 1) return if tag == "table": self.in_table = False if self.table_data: self.output.append("\n\n" + self._render_table() + "\n\n") self.table_data = [] elif tag == "thead": self.in_thead = False elif tag == "tr": if self.current_row: self.table_data.append({ "cells": self.current_row, "header": self.in_thead or (len(self.table_data) == 0 and self.cell_is_header) }) elif tag in ("th", "td"): self.current_row.append(self.current_cell.strip()) elif tag in ("h1", "h2", "h3", "h4"): self.output.append("\n\n") elif tag == "strong" or tag == "b": self.output.append("**") elif tag == "em" or tag == "i": self.output.append("*") elif tag == "a": if self.tag_stack and isinstance(self.tag_stack[-1], tuple): href = self.tag_stack[-1][1] self.output.append(f"]({href})") elif tag in ("ul", "ol"): self.in_list = None self.output.append("\n") elif tag == "code": self.output.append("`") elif tag == "pre": self.output.append("\n```\n") elif tag == "p" and not self.in_table: self.output.append("\n") if self.tag_stack: self.tag_stack.pop() def handle_data(self, data): if self.skip_depth > 0: return text = data if self.in_table and self.current_row is not None: self.current_cell += text else: self.output.append(text) def _render_table(self): if not self.table_data: return "" headers = [r for r in self.table_data if r["header"]] rows = [r for r in self.table_data if not r["header"]] if not headers and rows: headers = [rows.pop(0)] if not headers: return "" num_cols = len(headers[0]["cells"]) # Spaltenbreiten widths = [len(h) for h in headers[0]["cells"]] for row in rows: for i, cell in enumerate(row["cells"][:num_cols]): if i < len(widths): widths[i] = max(widths[i], len(cell)) widths = [max(w, 3) for w in widths] lines = [] # Header hcells = headers[0]["cells"][:num_cols] lines.append("| " + " | ".join(c.ljust(widths[i]) for i, c in enumerate(hcells)) + " |") # Separator lines.append("| " + " | ".join("-" * w for w in widths) + " |") # Body rows for row in rows: cells = row["cells"][:num_cols] padded = [] for i in range(num_cols): val = cells[i] if i < len(cells) else "" padded.append(val.ljust(widths[i])) lines.append("| " + " | ".join(padded) + " |") return "\n".join(lines) def get_markdown(self): text = "".join(self.output) # Cleanup text = re.sub(r'\n{3,}', '\n\n', text) text = text.strip() return text # Kategorisierung def categorize(slug): s = slug.lower() if any(k in s for k in ["guide-orcaslicer", "guide-cura", "guide-bambu", "guide-prusaslicer", "slicer"]): return "Slicer" if any(k in s for k in ["pla", "petg", "tpu", "asa", "abs", "nylon", "carbon", "resin", "filament", "bed-adhesion"]): return "Materialien" if any(k in s for k in ["stringing", "warping", "unterextrusion", "layer-separation", "elefantenfuss", "verstopfte"]): return "Fehlerbehebung" if any(k in s for k in ["retraction", "flow-rate", "pressure-advance", "input-shaping", "temperaturturm", "speed-tower", "erste-schicht", "druckbett-leveln"]): return "Kalibrierung" if any(k in s for k in ["adaptive", "modifier", "ironing", "fuzzy", "multi-material", "klipper"]): return "Fortgeschritten" if any(k in s for k in ["erstes-modell", "support", "infill", "duesenwechsel", "druckzeit", "masshaltigkeit", "bruecken", "nachbearbeiten", "gridfinity", "naht"]): return "Grundlagen" return "Allgemein" def difficulty(slug, cat): if any(k in slug for k in ["erstes-modell", "erste-schicht", "druckbett-leveln"]): return "einsteiger" if cat == "Fortgeschritten" or any(k in slug for k in ["klipper", "pressure-advance", "input-shaping", "carbon", "nylon-pa"]): return "experte" return "fortgeschritten" SKIP_SLUGS = { "guide-1-pla-perfekt-einstellen-2026-03-25", "guide-2-stringing-reduzieren-2026-03-25", "guide-1-warping-vermeiden-2026-03-26", "guide-2-petg-ohne-frust-2026-03-26", } def process_html(filepath, out_dir): with open(filepath) as f: raw = f.read() # Frontmatter extrahieren fm_match = re.match(r'^---\n([\s\S]*?)\n---\n([\s\S]*)$', raw) if not fm_match: return None fm_block = fm_match.group(1) html_content = fm_match.group(2).strip() title_m = re.search(r'title:\s*"(.+?)"', fm_block) slug_m = re.search(r'slug:\s*"(.+?)"', fm_block) excerpt_m = re.search(r'excerpt:\s*"(.*?)"', fm_block) title = title_m.group(1) if title_m else os.path.basename(filepath).replace(".html", "") slug = slug_m.group(1) if slug_m else os.path.basename(filepath).replace(".html", "") excerpt = excerpt_m.group(1) if excerpt_m else "" if slug in SKIP_SLUGS: return None # HTML parsen parser = ContentExtractor() parser.feed(html_content) md = parser.get_markdown() # WordPress-Kommentare entfernen md = re.sub(r'', '', md) # Verbleibende Artefakte md = re.sub(r'^\s*\\+\s*$', '', md, flags=re.MULTILINE) md = re.sub(r'\n{3,}', '\n\n', md) md = md.strip() cat = categorize(slug) diff = difficulty(slug, cat) frontmatter = f'---\ntitle: "{title}"\nslug: "{slug}"\ncategory: "{cat}"\ndifficulty: "{diff}"\nexcerpt: "{excerpt}"\n---' out_path = os.path.join(out_dir, f"{slug}.md") with open(out_path, 'w') as f: f.write(f"{frontmatter}\n\n{md}\n") return slug # Main print("=== Komplette Neukonvertierung aus HTML-Backup ===\n") backup_posts = "backup/content/posts" backup_pages = "backup/content/pages" out_guides = "app/src/content/guides" out_pages = "app/src/content/pages" os.makedirs(out_guides, exist_ok=True) os.makedirs(out_pages, exist_ok=True) print("Guides:") ok = 0 for f in sorted(glob.glob(os.path.join(backup_posts, "*.html"))): result = process_html(f, out_guides) if result: print(f" OK: {result}") ok += 1 print(f"\n{ok} Guides.\n") print("Pages:") pok = 0 for f in sorted(glob.glob(os.path.join(backup_pages, "*.html"))): result = process_html(f, out_pages) if result: print(f" OK: {result}") pok += 1 print(f"\n{pok} Pages.\n") # Verify print("=== Verifikation ===") issues = 0 for d in [out_guides, out_pages]: for fp in sorted(glob.glob(os.path.join(d, "*.md"))): with open(fp) as f: c = f.read() # Inline-Tabellen (sollte keine geben) inline = sum(1 for line in c.split("\n") if line.count("|") >= 8 and "| ---" in line) # Backslash-Muell bs = len(re.findall(r'^\s*\\{2,}', c, re.MULTILINE)) # Tabellen vorhanden? tables = c.count("\n| ---") if inline > 0 or bs > 0: print(f" ISSUE: {os.path.basename(fp)} ({inline} inline, {bs} bs)") issues += 1 elif tables > 0: print(f" {os.path.basename(fp)}: {tables} Tabellen OK") print(f"\n{'Alle sauber!' if issues == 0 else f'{issues} Probleme'}")