Source code for grilly.datasets.clean_conversations

"""
Conversations SVC Cleaner

Cleans the conversations_svc_semantic.jsonl file by removing or masking:
- Leaked filenames (``*.py``, ``*.pt``, ``*.txt``, etc.)
- File paths (Unix/Windows)
- Code artifacts (&&, backticks, arrows, shell commands)
- Technical numbers (checkpoint IDs, dimension specs)
- Leaked project names (AURA, STDP, etc.)
- Entries that become too short or invalid after cleaning

Output: conversations_svc_cleaned.jsonl
"""

import json
import re
import sys
from collections import Counter
from pathlib import Path

# Directory holding the dataset files; it sits next to this module.
DATA_DIR = Path(__file__).parent.joinpath("_data")
# JSONL file read by the cleaner.
INPUT_FILE = DATA_DIR.joinpath("conversations_svc_semantic.jsonl")
# JSONL file that receives the entries that survive cleaning.
OUTPUT_FILE = DATA_DIR.joinpath("conversations_svc_cleaned.jsonl")

# Patterns to clean
# File extensions: a word immediately followed by a known source/data/model
# suffix, e.g. "train.py" or "weights.safetensors" (case-insensitive).
FILE_EXT_PATTERN = re.compile(
    r"\b\w+\.(py|pt|txt|json|jsonl|js|ts|sh|yaml|yml|toml|cfg|ini|md|csv|log|bin|pkl|pth|ckpt|safetensors|onnx|h5|npz|npy)\b",
    re.IGNORECASE,
)

# File paths (Unix and Windows)
# Unix: at least two "/segment" components with an optional trailing slash.
UNIX_PATH_PATTERN = re.compile(r"(?:/[\w._-]+){2,}/?")
# Windows: upper-case drive letter followed by backslash-separated components.
WINDOWS_PATH_PATTERN = re.compile(r"[A-Z]:\\(?:[\w._-]+\\)*[\w._-]+")

# Code artifacts. The patterns are applied in list order, so order matters
# where two patterns can overlap (see the from-import / import pair below).
# Every keyword pattern is anchored with \b so it cannot fire in the middle of
# an ordinary word ("micropython ", "subclass Foo:").
CODE_PATTERNS = [
    re.compile(r"&&"),  # shell chaining
    re.compile(r"`[^`]+`"),  # backtick code
    re.compile(r"\bpython3?\s"),  # python commands
    re.compile(r"\bpip\s+install\b"),  # pip install
    re.compile(r"\bcd\s+\S+"),  # cd commands
    re.compile(r"\bgit\s+\w+"),  # git commands
    re.compile(r"\bnpm\s+\w+"),  # npm commands
    re.compile(r"\bsudo\s"),  # sudo
    re.compile(r"\$\{?\w+\}?"),  # shell variables
    # python from imports: must run BEFORE the bare import pattern, otherwise
    # "import name" is stripped first and "from module" is left behind.
    # Accepts dotted modules and also swallows the first imported name.
    re.compile(r"\bfrom\s+[\w.]+\s+import\b(?:\s+\w+)?"),
    re.compile(r"\bimport\s+\w+"),  # python imports
    re.compile(r"\bdef\s+\w+\s*\("),  # function defs
    re.compile(r"\bclass\s+\w+[:\(]"),  # class defs
]

# Dimension specs and technical numbers
DIMENSION_PATTERN = re.compile(r"\b\d+\s*[\u2192\u2190\u2194→←]\s*\d+")  # 64→128
# A keyword followed by a number such as "checkpoint_1200", "epoch 3" or
# "step 1,000". At least one digit is required so that plain prose
# ("the next step.", "on the line, ...") is left untouched, and the match ends
# on the last digit so sentence punctuation after the number is preserved.
CHECKPOINT_PATTERN = re.compile(
    r"\b(?:checkpoint|ckpt|epoch|step|line|iteration)[\s_]*\d+(?:[,._]\d+)*", re.IGNORECASE
)
LARGE_TECHNICAL_NUMBER = re.compile(r"\b\d{1,3}(?:,\d{3})+\b")  # 257,450 etc.

# Leaked project/model names
LEAKED_NAMES = re.compile(
    r"\b(?:AURA|STDP|stdp|phasic|neuromorphic|Vulkan|vulkan|GLSL|glsl|GrillCheese|grilly|grillcheese)\b",
    re.IGNORECASE,
)

# Bracket placeholders already in data - keep these
# NOTE(review): this pattern is not referenced by the code visible in this
# file; confirm whether another module relies on it.
PLACEHOLDER_PATTERN = re.compile(r"\[(?:PROJECT_NAME|FILE_PATH|VOLUME_PATH|ID_\d+|FILE|PATH|NUM)\]")


def clean_text(text: str) -> str:
    """Apply all cleaning rules to a text string.

    The rules run in a fixed order: file names and paths are masked, code
    artifacts are deleted, dimension specs / checkpoint references / large
    numbers / leaked names are masked, runs of three or more placeholders are
    folded into a single ``[REDACTED]``, and finally whitespace is normalized.

    Args:
        text: The raw text to sanitize.

    Returns:
        The cleaned text, with every whitespace run collapsed to one space and
        no leading or trailing whitespace.
    """
    # Replace file extensions with [FILE]
    text = FILE_EXT_PATTERN.sub("[FILE]", text)

    # Replace file paths
    text = UNIX_PATH_PATTERN.sub("[PATH]", text)
    text = WINDOWS_PATH_PATTERN.sub("[PATH]", text)

    # Remove code artifacts entirely
    for pattern in CODE_PATTERNS:
        text = pattern.sub("", text)

    # Mask dimension specs, checkpoint references, large numbers and names
    text = DIMENSION_PATTERN.sub("[DIM]", text)
    text = CHECKPOINT_PATTERN.sub("[CHECKPOINT]", text)
    text = LARGE_TECHNICAL_NUMBER.sub("[NUM]", text)
    text = LEAKED_NAMES.sub("[PROJECT_NAME]", text)

    # Collapse three or more placeholders in a row. This runs BEFORE the
    # whitespace normalization because the replacement ends with a space:
    # doing it afterwards left a trailing space whenever the run sat at the
    # end of the text.
    text = re.sub(
        r"(\[(?:FILE|PATH|NUM|DIM|CHECKPOINT|PROJECT_NAME)\]\s*){3,}", "[REDACTED] ", text
    )

    # Collapse whitespace runs and trim both ends
    return re.sub(r"\s+", " ", text).strip()
def clean_svc(svc: dict) -> dict:
    """Build a cleaned copy of an SVC mapping.

    Only the ``"s"``, ``"v"`` and ``"c"`` keys are read; each value is passed
    through ``clean_text`` and a missing key is treated as an empty string.
    Any other keys in ``svc`` are not carried over to the result.

    Args:
        svc: Mapping that may contain the ``"s"``, ``"v"`` and ``"c"`` fields.

    Returns:
        A new dict with exactly the keys ``"s"``, ``"v"`` and ``"c"``.
    """
    return {field: clean_text(svc.get(field, "")) for field in ("s", "v", "c")}
def is_valid_after_cleaning(entry: dict) -> bool:
    """Decide whether a cleaned entry still carries enough content to keep.

    An entry is kept only when both conditions hold:

    * its ``"text"`` has at least three whitespace-separated tokens that are
      longer than one character and do not start with ``"["`` (tokens starting
      with a bracket are treated as placeholders, not real words);
    * its ``"svc"`` mapping has a non-blank ``"v"`` value.

    Args:
        entry: The cleaned entry; missing ``"text"`` / ``"svc"`` / ``"v"``
            are treated as empty.

    Returns:
        ``True`` if the entry passes both checks, ``False`` otherwise.
    """
    meaningful = 0
    for token in entry.get("text", "").split():
        if len(token) > 1 and not token.startswith("["):
            meaningful += 1

    # Too little real text left: reject without looking at the SVC fields.
    if meaningful < 3:
        return False

    # The verb must survive cleaning as something other than whitespace.
    verb = entry.get("svc", {}).get("v", "")
    return bool(verb.strip())
def main():
    """Clean ``INPUT_FILE`` line by line and write the survivors to ``OUTPUT_FILE``.

    Each non-blank input line is parsed as JSON. The entry's ``"text"``,
    ``"svc"`` and (when present) ``"lemmas"`` are cleaned, the entry is
    validated with ``is_valid_after_cleaning`` and, if it passes, written as
    one JSON line. Lines that fail to parse, do not hold a JSON object, or are
    invalid after cleaning are dropped and counted by reason. A progress line
    is printed every 20,000 input entries and a summary at the end.
    """
    # Switch stdout to UTF-8 when it is not already (presumably for consoles
    # whose default encoding cannot print the non-ASCII text in the data).
    if sys.stdout.encoding != "utf-8":
        sys.stdout.reconfigure(encoding="utf-8")

    print(f"Cleaning: {INPUT_FILE.name}")
    print(f"Output: {OUTPUT_FILE.name}")

    total = 0
    kept = 0
    dropped = 0
    cleaned_count = 0  # entries where text was modified
    realm_counter = Counter()
    drop_reasons = Counter()

    with (
        open(INPUT_FILE, encoding="utf-8") as fin,
        open(OUTPUT_FILE, "w", encoding="utf-8") as fout,
    ):
        for line in fin:
            line = line.strip()
            if not line:
                continue
            total += 1

            # None means "keep"; otherwise the key to count in drop_reasons.
            drop_reason = None
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                drop_reason = "json_error"
            else:
                # Valid JSON that is not an object (list, string, number...)
                # has no fields to clean; drop it instead of crashing on .get.
                if not isinstance(entry, dict):
                    drop_reason = "not_an_object"

            if drop_reason is None:
                # A missing "text" is treated as empty (and later rejected by
                # validation) rather than raising KeyError.
                original_text = entry.get("text", "")

                # Clean text and SVC
                entry["text"] = clean_text(original_text)
                entry["svc"] = clean_svc(entry.get("svc", {}))

                # Clean lemmas too (may contain leaked names)
                if "lemmas" in entry:
                    entry["lemmas"] = [
                        clean_text(lemma) if LEAKED_NAMES.search(lemma) else lemma
                        for lemma in entry["lemmas"]
                    ]

                # Track if we changed anything
                if entry["text"] != original_text:
                    cleaned_count += 1

                # Validate after cleaning
                if not is_valid_after_cleaning(entry):
                    drop_reason = "too_short_or_invalid"

            if drop_reason is None:
                realm_counter[entry.get("realm", "?")] += 1
                fout.write(json.dumps(entry, ensure_ascii=False) + "\n")
                kept += 1
            else:
                drop_reasons[drop_reason] += 1
                dropped += 1

            # Reported for every 20,000th input entry, kept or dropped.
            if total % 20000 == 0:
                print(f" Processed {total:,}... kept {kept:,}, dropped {dropped:,}")

    def percent(count: int) -> float:
        # An empty input file leaves total at 0; report 0.0% instead of
        # raising ZeroDivisionError.
        return count / total * 100 if total else 0.0

    print("\n--- Results ---")
    print(f" Total input: {total:,}")
    print(f" Kept: {kept:,} ({percent(kept):.1f}%)")
    print(f" Dropped: {dropped:,} ({percent(dropped):.1f}%)")
    print(f" Modified: {cleaned_count:,} ({percent(cleaned_count):.1f}%)")
    print(f" Drop reasons: {dict(drop_reasons)}")
    print(f" Realms: {dict(realm_counter.most_common())}")
    print(f"\nOutput: {OUTPUT_FILE}")


# Run the cleaner only when executed as a script, not when imported.
if __name__ == "__main__":
    main()