#!/usr/bin/env python3
"""Filter and copy published Obsidian notes to a Quartz content directory."""

import argparse
import re
import shutil
import sys
from pathlib import Path
from typing import Optional

import yaml

# Closing frontmatter delimiter: a line consisting of '---' (optionally
# followed by blanks).  Requiring the end of the line prevents lines such as
# '----' or '---text' from being mistaken for the end of the frontmatter.
_FM_CLOSE_RE = re.compile(r"\n---[ \t]*(?=\r?(?:\n|\Z))")

# Standard Markdown image: ![alt](assets/relative/path.ext)
_MD_IMAGE_RE = re.compile(r"!\[.*?\]\(assets/([^)\s]+)\)")

# Obsidian embed: ![[name.ext]] or ![[name.ext|display]]
_EMBED_RE = re.compile(r"!\[\[([^\]|]+?)(?:\|[^\]]*)?\]\]")

# Wikilink, with or without the '!' embed prefix.
# Groups: (1) prefix ('[[' or '![['), (2) target, (3) heading, (4) title
_WIKILINK_RE = re.compile(
    r"(!?\[\[)([^\]|#\n]+?)(?:#([^\]|\n]*))?(?:\|([^\]\n]*))?\]\]"
)


def _is_safe_relative(rel: Path) -> bool:
    """Return True when joining *rel* onto a directory cannot escape it.

    Rejects anchored paths (absolute, or carrying a drive) and any path with
    a ``..`` component.  The check is purely lexical, so symlinks are not
    followed.
    """
    return not rel.anchor and ".." not in rel.parts


def _is_published(fm: Optional[dict]) -> bool:
    """Return True when frontmatter *fm* carries ``published: true``."""
    return fm is not None and fm.get("published") is True


def _output_stem(fm: dict, original_stem: str) -> str:
    """Return the output filename stem for a note.

    The ``slug`` frontmatter value is used when it is truthy and cannot
    escape the destination directory; otherwise the original stem lowercased.
    """
    slug = fm.get("slug")
    if slug and _is_safe_relative(Path(str(slug))):
        return str(slug)
    return original_stem.lower()


def parse_frontmatter(content: str) -> tuple[Optional[dict], str]:
    """Parse YAML frontmatter from markdown content.

    Returns ``(frontmatter_dict, body_after_closing_dashes)``, or
    ``(None, content)`` when there is no frontmatter, it is not terminated by
    a ``---`` line, it is not valid YAML, or it does not parse to a mapping.
    Body is everything after the closing ``---``, including the leading
    newline.
    """
    # Skip the whole opening delimiter line, whichever line ending it uses.
    if content.startswith("---\r\n"):
        rest = content[5:]
    elif content.startswith("---\n"):
        rest = content[4:]
    else:
        return None, content

    closing = _FM_CLOSE_RE.search(rest)
    if closing is None:
        return None, content

    fm_text = rest[: closing.start()]
    # Body starts right after the "\n---" marker (4 characters).
    body = rest[closing.start() + 4:]

    try:
        fm = yaml.safe_load(fm_text)
    except yaml.YAMLError:
        return None, content
    if not isinstance(fm, dict):
        return None, content
    return fm, body


def inject_publish(content: str, title: str | None = None) -> str:
    """Inject ``publish: true`` (and optionally ``title``) into frontmatter.

    If ``publish`` already exists it is overwritten.  When *title* is
    provided it is written as the ``title`` key unless the frontmatter
    already has a truthy ``title``.  Returns *content* unchanged if there is
    no valid frontmatter.
    """
    fm, body = parse_frontmatter(content)
    if fm is None:
        return content

    fm["publish"] = True
    if title and not fm.get("title"):
        fm["title"] = title

    new_fm = yaml.dump(
        fm, allow_unicode=True, default_flow_style=False, sort_keys=False
    ).strip()
    return f"---\n{new_fm}\n---{body}"


def find_asset_references(content: str, source_assets_dir: Path) -> set[Path]:
    """Find asset references in *content* that resolve to real files.

    Handles two syntaxes:

    * ``![alt text](assets/path/to/file.ext)`` — standard Markdown image
    * ``![[filename.ext]]`` or ``![[filename.ext|width]]`` — Obsidian embed

    Only paths that actually exist under *source_assets_dir* are returned.
    References that are absolute or contain ``..`` are never resolved as
    exact paths, so the result cannot point outside the assets directory.

    Returns a set of :class:`Path` objects relative to *source_assets_dir*.
    """
    found: set[Path] = set()

    # Standard Markdown: ![alt](assets/relative/path.ext)
    for match in _MD_IMAGE_RE.finditer(content):
        rel = Path(match.group(1))
        if _is_safe_relative(rel) and (source_assets_dir / rel).is_file():
            found.add(rel)

    # Obsidian wikilink embed: ![[name.ext]] or ![[name.ext|display]]
    for match in _EMBED_RE.finditer(content):
        ref = Path(match.group(1).strip())
        suffix = ref.suffix.lower()

        # Skip if no extension or a Markdown file (it's an embedded note)
        if not suffix or suffix == ".md":
            continue

        # 1. Try exact path inside assets dir (supports ![[sub/file.png]])
        if _is_safe_relative(ref) and (source_assets_dir / ref).is_file():
            found.add(ref)
            continue

        # 2. Search recursively by filename (Obsidian stores by unique name).
        #    Sorted so that the chosen match does not depend on filesystem
        #    enumeration order.
        for hit in sorted(source_assets_dir.rglob(ref.name)):
            if hit.is_file():
                found.add(hit.relative_to(source_assets_dir))
                break  # Use first match

    return found


def build_rename_mapping(source_dir: Path) -> dict[str, str]:
    """Build a case-insensitive mapping from original stem to output stem.

    Pass 1: scan all published ``.md`` files and determine output filenames.

    * If the note has a usable ``slug`` frontmatter key → new stem = slug.
    * Otherwise → new stem = original filename stem lowercased.

    The returned dict uses lowercased stems as keys so that lookups from
    wikilinks (which are case-insensitive in Obsidian) work correctly.

    A warning is printed when two published notes would map to the same
    output stem, and when a slug is ignored because it would escape the
    destination directory.
    """
    mapping: dict[str, str] = {}  # lowercase_stem → new_stem
    reverse: dict[str, str] = {}  # new_stem → original key (collision check)

    for md_file in sorted(source_dir.rglob("*.md")):
        rel_path = md_file.relative_to(source_dir)
        if rel_path.parts[0] == "assets":
            continue

        fm, _ = parse_frontmatter(md_file.read_text(encoding="utf-8"))
        if not _is_published(fm):
            continue

        key = md_file.stem.lower()
        new_stem = _output_stem(fm, md_file.stem)

        slug = fm.get("slug")
        if slug and new_stem != str(slug):
            print(
                f"Warning: ignoring unsafe slug '{slug}' in '{rel_path}'; "
                f"using '{new_stem}' instead."
            )

        if new_stem in reverse:
            print(
                f"Warning: output stem collision '{new_stem}': "
                f"'{reverse[new_stem]}' and '{key}' both map to the same filename."
            )

        mapping[key] = new_stem
        reverse[new_stem] = key

    return mapping


def rewrite_wikilinks(content: str, rename_mapping: dict[str, str]) -> str:
    """Rewrite wikilink targets in *content* using *rename_mapping*.

    Handles all four wikilink forms (with and without ``!`` prefix):

    * ``[[OldName]]`` → ``[[new-name]]``
    * ``[[OldName|Title]]`` → ``[[new-name|Title]]``
    * ``[[OldName#heading]]`` → ``[[new-name#heading]]``
    * ``[[OldName#heading|Title]]`` → ``[[new-name#heading|Title]]``

    A link is rewritten only when its target (case-insensitive, without the
    ``.md`` extension) is a key in *rename_mapping*.  Unmapped targets that
    contain a ``.`` and do not end in ``.md`` (asset embeds such as
    ``![[image.png]]``) are left untouched.  Any other unmapped link is
    replaced by its display text, i.e. the link markup is dropped.
    """

    def _replace(m: re.Match) -> str:
        prefix = m.group(1)
        target = m.group(2).strip()
        heading = m.group(3)  # None when absent
        title = m.group(4)  # None when absent

        # Strip .md suffix for the lookup
        is_md = target.lower().endswith(".md")
        target_stem = target[:-3] if is_md else target
        new_stem = rename_mapping.get(target_stem.lower())

        if new_stem is None:
            if "." in target and not is_md:
                # Asset reference (e.g., ![[image.png]]) — leave unchanged
                return m.group(0)
            # Dead link to an unpublished/non-existing page — emit only the
            # display text, without any link markup.
            return title if title is not None else target_stem

        result = prefix + new_stem
        if heading is not None:
            result += f"#{heading}"
        if title is not None:
            result += f"|{title}"
        elif target_stem != new_stem:
            # Auto-add the original name as display title when renaming
            result += f"|{target_stem}"
        return result + "]]"

    return _WIKILINK_RE.sub(_replace, content)


def filter_notes(source_dir: Path, dest_dir: Path) -> None:
    """Filter and copy published notes and their referenced assets.

    Every ``.md`` file under *source_dir* (outside its top-level ``assets``
    directory) whose frontmatter has ``published: true`` is written to
    *dest_dir* under the same relative directory, renamed to its output stem,
    with ``publish: true`` injected and wikilinks rewritten.  Assets
    referenced by those notes are copied to ``dest_dir/assets``.  A summary
    line is printed at the end.
    """
    notes_copied = 0
    notes_skipped = 0
    all_asset_refs: set[Path] = set()

    source_assets_dir = source_dir / "assets"
    has_assets = source_assets_dir.is_dir()

    # Pass 1: build the rename mapping for all published notes
    rename_mapping = build_rename_mapping(source_dir)

    # Pass 2: copy each published note with rewritten links and new filename
    for md_file in sorted(source_dir.rglob("*.md")):
        rel_path = md_file.relative_to(source_dir)

        # Skip anything inside the assets/ directory
        if rel_path.parts[0] == "assets":
            continue

        content = md_file.read_text(encoding="utf-8")
        fm, _ = parse_frontmatter(content)
        if not _is_published(fm):
            notes_skipped += 1
            continue

        # Same stem rule as pass 1, so links and filenames always agree.
        new_filename = _output_stem(fm, md_file.stem) + ".md"

        # Inject publish flag (+ title from filename) then rewrite wikilinks
        out_content = rewrite_wikilinks(
            inject_publish(content, title=md_file.stem), rename_mapping
        )

        dest_file = dest_dir / rel_path.parent / new_filename
        dest_file.parent.mkdir(parents=True, exist_ok=True)
        dest_file.write_text(out_content, encoding="utf-8")
        notes_copied += 1

        # Collect asset references from the *original* content
        if has_assets:
            all_asset_refs.update(
                find_asset_references(content, source_assets_dir)
            )

    # Copy referenced assets (sorted for a reproducible copy order)
    dest_assets_dir = dest_dir / "assets"
    for asset_rel in sorted(all_asset_refs):
        dst = dest_assets_dir / asset_rel
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source_assets_dir / asset_rel, dst)
    assets_copied = len(all_asset_refs)

    print(
        f"Done: {notes_copied} notes copied, "
        f"{assets_copied} assets copied, "
        f"{notes_skipped} notes skipped."
    )


def main() -> None:
    """Parse command-line arguments and run the filter."""
    parser = argparse.ArgumentParser(
        description="Copy published Obsidian notes to a Quartz content directory."
    )
    parser.add_argument("source_dir", type=Path, help="Source Obsidian vault directory")
    parser.add_argument(
        "dest_dir", type=Path, help="Destination Quartz content directory"
    )
    args = parser.parse_args()

    if not args.source_dir.is_dir():
        print(
            f"Error: source_dir '{args.source_dir}' is not a directory",
            file=sys.stderr,
        )
        sys.exit(1)

    filter_notes(args.source_dir, args.dest_dir)


if __name__ == "__main__":
    main()