Layer 1
Log Cleaning Scripts
Raw session logs are noisy — full of ANSI escape codes, TUI spinner text, and control characters. These Python scripts clean them into digestible conversation text.
iTerm2 Session Log Cleaner
If you use iTerm2's "Automatically log session input/output" feature, your logs will be full of terminal noise. This script strips timestamps, ANSI codes, TUI elements (spinners, status bars), and reassembles single-character keystrokes back into typed words.
#!/usr/bin/env python3
"""
iTerm2 session log cleaner for Claude Code digest pipeline.
Strategy: strip timestamps and control chars, then KEEP only lines that
look like real conversational content. Everything else is TUI noise.
Usage:
python3 clean-session-log.py <input.log> [output.txt]
python3 clean-session-log.py <input.log> # prints to stdout
"""
import re, sys
from pathlib import Path
# --- Timestamp prefix (iTerm2 log format: "[MM/DD/YY, HH:MM:SS.mmm PM] ") ---
TS_RE = re.compile(r"^\[[\d/]+,\s*[\d:.]+ [AP]M\]\s*")
# --- Control character stripping ---
# CSI escape sequences -- the parameter class now also accepts the
# private-mode/extension prefixes "?", "=", ">", "<" so sequences like
# "\x1b[?25l" (cursor hide/show, emitted constantly by TUIs) are removed
# whole instead of leaving "[?25l" residue behind. The second alternative
# matches OSC sequences terminated by BEL or ST.
ANSI_RE = re.compile(r"\x1b\[[0-9;?=><]*[A-Za-z]|\x1b\].*?(\x07|\x1b\\)")
# Remaining C0 control chars and DEL (tab \x09, LF \x0a, CR \x0d are kept).
CTRL_CHARS_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]")
# Runs (2+) of box-drawing, block-element, and geometric-shape glyphs used
# for TUI borders, separators, and progress bars.
TUI_FILLER_RE = re.compile(r"[\u2500-\u257f\u2580-\u259f\u25a0-\u25ff]{2,}")
# --- Noise patterns (lines to DROP) ---
NOISE_PATTERNS = [
    # TUI spinner verbs (Claude Code rotates these)
    re.compile(r"^[·✢✳✶✻✽❯⏺\s]*(Embellishing|Grooving|Orbiting|Composing|"
               r"Thinking|Reflecting|Musing|Pondering|Weaving|Crafting|"
               r"Sculpting|Harmonizing|Polishing|Illuminating|Reasoning|"
               r"Brainstorming|Synthesizing|Considering|Generating|"
               r"Processing|Analyzing|Exploring|Initializing|Loading)"),
    # Editor mode indicators (vim-style status lines)
    re.compile(r"--\s*(INSERT|NORMAL|VISUAL|REPLACE)\s*--"),
    # Model name banners in the status bar
    re.compile(r"(Opus\s*4\.\d|Sonnet\s*4\.\d|Haiku\s*4\.\d)"),
    re.compile(r"^lbruton@\S+.*%"), # shell prompts
    re.compile(r"^(Last login:|You have mail)"),
    re.compile(r"claude\.ai/code/session_"),
    # Permission-prompt chrome
    re.compile(r"Do you want to proceed\?|Esc\s*to\s*cancel"),
    # Bare tool-name lines with no arguments
    re.compile(r"^(Read|Edit|Write|Bash|Glob|Grep)\s*(file)?\s*$"),
]
# An alphabetic "word" of 2+ letters; used to gauge conversational content.
WORD_RE = re.compile(r"\b[a-zA-Z]{2,}\b")
def strip_line(line):
    """Normalize one raw log line.

    Applies, in order: timestamp-prefix removal, ANSI escape removal,
    control-character removal, and TUI filler-glyph removal; then squeezes
    runs of 3+ spaces to a single space and trims trailing whitespace.
    """
    for pattern in (TS_RE, ANSI_RE, CTRL_CHARS_RE, TUI_FILLER_RE):
        line = pattern.sub("", line)
    return re.sub(r" {3,}", " ", line).rstrip()
def is_noise(line):
    """Return True when the line is blank, too short (< 4 chars after
    stripping), or matches any known TUI/noise pattern."""
    text = line.strip()
    if len(text) < 4:  # also covers the empty-string case
        return True
    for pattern in NOISE_PATTERNS:
        if pattern.search(text):
            return True
    return False
def has_content(line):
    """Return True when the line carries enough alphabetic words to look
    like real conversation.

    Lines containing backticks or slashes (likely code or paths) get a
    lower bar of 2 words; plain prose needs 3.
    """
    text = line.strip()
    required = 2 if ("`" in text or "/" in text) else 3
    return len(WORD_RE.findall(text)) >= required
def reassemble_keystrokes(lines):
    """Collapse runs of single-character lines into "[user typed: ...]" markers.

    iTerm2 logs each keystroke on its own line; consecutive single printable
    ASCII characters are buffered and joined. Assembled runs of 1-2 chars are
    dropped as accidental noise; all other lines pass through unchanged.
    """
    out = []
    pending = []

    def flush():
        # Emit the buffered keystrokes (if long enough to be a real word).
        typed = "".join(pending)
        if len(typed) > 2:
            out.append(f"[user typed: {typed}]")
        pending.clear()

    for raw in lines:
        ch = raw.strip()
        if len(ch) == 1 and ch.isprintable() and ord(ch) < 128:
            pending.append(ch)
            continue
        flush()
        out.append(raw)
    flush()  # trailing run at end of input
    return out
def clean_log(raw_text):
    """Run the full cleaning pipeline over a raw session log.

    Pipeline: strip each line -> reassemble keystroke runs -> keep only
    non-noise lines with real content -> collapse blank-line runs to one.
    Returns the cleaned text with outer whitespace trimmed.
    """
    stripped = [strip_line(raw) for raw in raw_text.splitlines()]
    assembled = reassemble_keystrokes(stripped)
    kept = [ln for ln in assembled if not is_noise(ln) and has_content(ln)]
    # Collapse runs of blank lines down to a single blank.
    collapsed = []
    blank_run = 0
    for ln in kept:
        if ln.strip():
            blank_run = 0
            collapsed.append(ln)
        else:
            blank_run += 1
            if blank_run == 1:
                collapsed.append("")
    return "\n".join(collapsed).strip()
if __name__ == "__main__":
    # CLI entry: clean-session-log.py <input.log> [output.txt]
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <input.log> [output.txt]", file=sys.stderr)
        sys.exit(1)
    raw = Path(sys.argv[1]).read_text(errors="replace")
    cleaned = clean_log(raw)
    if len(sys.argv) >= 3:
        Path(sys.argv[2]).write_text(cleaned)
        r, c = len(raw.splitlines()), len(cleaned.splitlines())
        # Guard against an empty input file: r == 0 would otherwise raise
        # ZeroDivisionError when computing the reduction percentage.
        pct = (r - c) / r * 100 if r else 0.0
        print(f"Cleaned: {r} -> {c} lines ({pct:.0f}% reduction)", file=sys.stderr)
    else:
        print(cleaned)
JSONL Transcript Extractor
Claude Code stores session transcripts as JSONL files in ~/.claude/projects/. This script extracts clean conversation text with actor attribution, tool call summaries (not full output), and session metadata. It filters out thinking blocks, tool results (which can contain entire file contents), and system injections.
#!/usr/bin/env python3
"""
Extract clean conversation text from Claude Code JSONL transcripts.
Output format:
### User
<user message text>
### Assistant
<assistant response text>
### Tool: <tool_name>
<brief tool context — name + key input, NOT full output>
Filters out: thinking blocks, tool_result payloads, system messages.
"""
import json, sys
from datetime import datetime
from pathlib import Path
def extract_session_metadata(entries):
    """Scan parsed JSONL entries and collect session-level metadata.

    Returns a dict with:
        project    -- last non-empty path component of cwd, or "unknown"
        branch     -- first gitBranch seen
        cwd        -- first cwd seen
        start_time -- lexicographic min of ISO timestamps (ISO-8601 sorts
                      correctly as strings)
        end_time   -- lexicographic max of ISO timestamps
        tool_count -- number of tool_use blocks across all messages
        tools_used -- sorted list of distinct tool names
    """
    meta = {"project": "unknown", "branch": "", "cwd": "",
            "start_time": "", "end_time": "",
            "tool_count": 0, "tools_used": set()}
    for entry in entries:
        if not isinstance(entry, dict):
            continue  # tolerate stray non-object JSONL lines
        if not meta["cwd"] and entry.get("cwd"):
            meta["cwd"] = entry["cwd"]
        if not meta["branch"] and entry.get("gitBranch"):
            meta["branch"] = entry["gitBranch"]
        ts = entry.get("timestamp", "")
        if ts:
            if not meta["start_time"] or ts < meta["start_time"]:
                meta["start_time"] = ts
            if not meta["end_time"] or ts > meta["end_time"]:
                meta["end_time"] = ts
        msg = entry.get("message", {})
        if isinstance(msg, dict):
            for block in (msg.get("content") or []):
                if isinstance(block, dict) and block.get("type") == "tool_use":
                    meta["tool_count"] += 1
                    name = block.get("name", "")
                    if name:  # don't record nameless tool blocks
                        meta["tools_used"].add(name)
    # Derive project from the last non-empty path component so that "/",
    # trailing slashes, and an empty cwd all fall back to "unknown"
    # (the original could yield "" for cwd == "/").
    parts = [p for p in meta["cwd"].split("/") if p]
    if parts:
        meta["project"] = parts[-1]
    meta["tools_used"] = sorted(meta["tools_used"])
    return meta
def _user_text(content):
    """Extract plain text from a user message's content (str or block list)."""
    if isinstance(content, str):
        return content.strip()
    if isinstance(content, list):
        # Fix: use .get("text", "") — the original indexed b["text"] and
        # raised KeyError on a text block missing the key.
        return "\n".join(
            b.get("text", "") for b in content
            if isinstance(b, dict) and b.get("type") == "text"
        ).strip()
    return ""

def _tool_summary(block):
    """One-line summary for a tool_use block: tool name plus its key input
    (file path, truncated command, grep pattern, or agent description)."""
    name = block.get("name", "?")
    inp = block.get("input", {}) or {}
    ctx = ""
    if name in ("Read","Write","Edit"):
        ctx = inp.get("file_path", "")
    elif name == "Bash":
        cmd = inp.get("command", "")
        ctx = cmd[:120] + ("..." if len(cmd) > 120 else "")
    elif name == "Grep":
        ctx = f'pattern="{inp.get("pattern","")}"'
    elif name == "Agent":
        ctx = inp.get("description", "")[:80]
    return f"[Tool: {name} -> {ctx}]" if ctx else f"[Tool: {name}]"

def extract_conversation(jsonl_path):
    """Parse a Claude Code JSONL transcript into readable conversation text.

    Returns (text, meta): text is a session header plus "### User" /
    "### Assistant" sections with brief tool-call summaries (never full
    tool output); meta is extract_session_metadata()'s dict. Malformed
    JSON lines, non-dict entries, and non-text blocks are skipped.
    Returns ("", {}) when no entries parse.
    """
    entries = []
    for line in jsonl_path.read_text(errors="replace").splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            entries.append(json.loads(line))
        except json.JSONDecodeError:
            continue  # tolerate truncated/corrupt lines
    if not entries:
        return "", {}
    meta = extract_session_metadata(entries)
    parts = [f"--- Session: {meta['project']} | "
             f"branch={meta['branch']} | "
             f"tools={','.join(meta['tools_used'][:10])} ---\n"]
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        msg = entry.get("message", {})
        if not isinstance(msg, dict):
            continue  # guard: the original crashed on non-dict messages
        etype = entry.get("type", "")
        if etype == "user":
            text = _user_text(msg.get("content", ""))
            if text:
                parts.append(f"### User\n{text}\n")
        elif etype == "assistant":
            content = msg.get("content", [])
            if not isinstance(content, list):
                continue
            texts, tools = [], []
            for block in content:
                if not isinstance(block, dict):
                    continue
                btype = block.get("type")
                if btype == "text":
                    t = block.get("text", "").strip()
                    if t:
                        texts.append(t)
                elif btype == "tool_use":
                    tools.append(_tool_summary(block))
            if texts:
                parts.append("### Assistant\n" + "\n".join(texts) + "\n")
            if tools:
                parts.append(" " + " ".join(tools) + "\n")
    return "\n".join(parts), meta
if __name__ == "__main__":
    # CLI entry: extract a JSONL transcript; write to a file or print to stdout.
    args = sys.argv
    if len(args) < 2:
        print(f"Usage: {args[0]} <session.jsonl> [output.txt]",
              file=sys.stderr)
        sys.exit(1)
    text, meta = extract_conversation(Path(args[1]))
    if len(args) >= 3:
        Path(args[2]).write_text(text)
    else:
        print(text)
Where are the logs?
iTerm2 logs: Enable in iTerm2 Preferences → Profiles → Session → "Automatically log session input/output." Logs land in your configured directory (e.g.,
~/.claude/iterm2/).
JSONL transcripts: Claude Code stores these automatically at
~/.claude/projects/<project-hash>/*.jsonl. Each session is one file.