---
HANDOFF: Session Summary — Docling RAG Pipeline Build
CATEGORY: 00_Overview
FILE: handoff_00_command.md
STATUS: Active
LANE: 00 - Command
LAST_UPDATED: 2026-04-13
OWNER: admin
ALLOWED_CONSUMERS: all tiers
---

# Session Handoff — 2026-04-13
# Docling → RAG Pipeline — Full Build From Zero

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
SECTION 1 — ENVIRONMENT STATE (confirmed)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Host:            gaming-ai-System
IP:              10.0.0.200
OS:              Ubuntu 24.04
User:            kipper
Apps dir:        ~/apps/
NAS:             QNAP NAS2991B8 @ 10.0.0.100
Share:           Vault (SMB 3.0)
Mount:           /mnt/Vault
fstab:           //10.0.0.100/Vault /mnt/Vault cifs credentials=/etc/vault-credentials,rw,vers=3.0,uid=1000,gid=1000,_netdev 0 0
Credentials:     /etc/vault-credentials (chmod 600)
Network:         10.0.0.x / 10GbE
Docker:          Active (172.17.0.1)

Active services:
  Open WebUI          http://10.0.0.200:8080
  code-server         http://10.0.0.200:8081
  text-gen-webui      http://10.0.0.200:7860
  Claude Code 2.1.104 authenticated (Max subscription)
  Postgres 16         localhost:5432

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
SECTION 2 — HARDWARE INVENTORY (locked)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

ACTIVE NODE
  Hostname: gaming-ai-System
  IP:       10.0.0.200
  CPU:      AMD Ryzen 9 9950X
  RAM:      64GB DDR5
  GPUs:     2x RTX 3090 24GB (48GB total VRAM)
  GPU 0:    Consumed by text-gen-webui (21.56GB/24GB)
  Network:  10GbE enp12s0
  Role:     Active workstation / local AI node
  Status:   Online — operational

FUTURE HEAVY INFERENCE NODE
  Hostname: Ai-System
  Platform: TRX40
  RAM:      128GB DDR4-3200
  GPUs:     6x RTX 3090 24GB (144GB total VRAM)
  Role:     Heavy inference / larger model machine
  Status:   Parts purchased — build pending

FUTURE INGESTION FLEET
  Count:    12 rigs
  CPU:      Xeon processors
  GPUs:     5x RTX 3060 12GB per rig (60GB VRAM each)
  Total:    720GB VRAM across fleet
  Config:   2 on-board + 3 on PCIe 3.0 x1 risers
  Storage:  Large NVMe per rig (planned)
  Role:     Document processing / RAG ingestion
  Status:   Idle — needs OS bring-up

Fleet split recommendation:
  8  rigs  Docling batch processing
  2  rigs  Fine-tune data prep
  1  rig   NFS master
  1  rig   Monitor + job router

NAS
  Model:   QNAP NAS2991B8 @ 10.0.0.100
  Volume:  DataVol1 (~31TB usable)
  Share:   Vault (SMB 3.0)
  Speed:   10GbE confirmed

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
SECTION 3 — INSTALLED PACKAGES
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

docling                2.86.0
anthropic              0.94.0
sentence-transformers  5.4.0
psycopg2-binary        2.9.11
postgresql             16
postgresql-16-pgvector 0.6.0
Python                 3.12.3
pip flag               --break-system-packages

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
SECTION 4 — ANTHROPIC API
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Key:          exported as ANTHROPIC_API_KEY in ~/.bashrc
Account:      kiphohman@gmail.com
Subscription: Claude Max
Model:        claude-sonnet-4-20250514

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
SECTION 5 — PIPELINE SCRIPTS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

~/apps/docling_claude.py        Test — original single-file tester
~/apps/docling_batch.py         Layer 1 — ingestion
~/apps/docling_chunk_vault.py   Layer 2 — chunking
~/apps/docling_embed_vault.py   Layer 3+4 — embeddings + pgvector

Vault backup: /mnt/Vault/02_AI_Stack/pipelines/scripts/

Run order for new documents:
  1. Drop files into input/
  2. CUDA_VISIBLE_DEVICES="" python3 ~/apps/docling_batch.py
  3. CUDA_VISIBLE_DEVICES="" python3 ~/apps/docling_chunk_vault.py
  4. CUDA_VISIBLE_DEVICES="" python3 ~/apps/docling_embed_vault.py
  5. Query via match_chunks() in Postgres

NOTE: Always use CUDA_VISIBLE_DEVICES="" — GPU 0 reserved for text-gen-webui

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
SECTION 6 — VAULT STRUCTURE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

/mnt/Vault/
├── 00_Overview/
├── 01_Infrastructure/
├── 02_AI_Stack/
│   └── pipelines/
│       ├── data_ingestion.md
│       ├── data_ingestion_rag_concepts.md
│       ├── scripts/
│       └── data_ingestion/
│           ├── input/
│           ├── output/
│           ├── chunks/
│           └── logs/
├── 03_Networking/
├── 04_Security/
├── 05_Operations/
├── 06_Research/
└── 07_Archive/

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
SECTION 7 — DATABASE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Engine:    PostgreSQL 16 + pgvector 0.6.0
Host:      localhost:5432
Database:  ai_studio
User:      vault_user
Password:  vault_secure_2026
Tables:    documents, chunks
Embedding: sentence-transformers/all-MiniLM-L6-v2 (384 dim)
Chunks:    256-token max, 111-token average, merge_peers=True

Search:
  match_chunks(query_embedding vector(384), match_count INT DEFAULT 5)
  Returns: id, document_id, content, similarity FLOAT

Loaded:
  TRX40 PRO 10G Motherboard.pdf — 305 chunks — 2026-04-13

Test confirmed:
  Query: "What memory slots should I use for 4 DIMMs?"
  Result 1 similarity: 0.678 — correct answer returned

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
SECTION 8 — PIPELINE ARCHITECTURE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

PDF/DOCX/PPTX/HTML/XLSX
      ↓
[Layer 1] docling_batch.py → .md + _summary.md
      ↓
[Layer 2] docling_chunk_vault.py → _chunks.txt
      ↓
[Layer 3+4] docling_embed_vault.py → Postgres pgvector
      ↓
[Layer 5 — NEXT] RAG Agent → match_chunks() → LLM answer

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
SECTION 9 — OPEN ITEMS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

NEXT SESSION:
  [ ] Layer 5 RAG agent — wire match_chunks() to LLM
  [ ] Framework decision: LangChain vs Pydantic AI

INFRASTRUCTURE:
  [ ] Build Ai-System (TRX40 / 6x3090)
  [ ] Bring 12 Xeon rigs online (Ubuntu install)
  [ ] Static IP for gaming-ai-System
  [ ] Snapshot schedule on Vault
  [ ] Tailscale node list and access boundaries

DATABASE:
  [ ] HNSW index: CREATE INDEX ON chunks USING hnsw (embedding vector_cosine_ops)
  [ ] Tune chunker token warning (oversized chunk: 3282 tokens exceeds the 512-token threshold)
  [ ] Load domain documents (spring specs, Inconel/Monel datasheets)

FUTURE:
  [ ] Fine-tuning environment (Unsloth or Axolotl on 3090s)
  [ ] GPU lane assignment per service
  [ ] Cloud backup decision (Backblaze B2 candidate)

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
SECTION 10 — DESIGN RULES
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. Canonical record first — Vault before model packets
2. Models retrieve from Vault .md — do not re-read long chats
3. Networking closed — 10GbE, NAS 10.0.0.100, Tailscale on NAS
4. Cloud not in critical path — on-prem first
5. Ai-System is additive — does not replace gaming-ai-System
6. Always CUDA_VISIBLE_DEVICES="" for Docling — GPU 0 reserved
7. Vault is source of truth — scripts backed up to 02_AI_Stack/pipelines/scripts/
8. Goal: Solutions-based AI — Inconel/Monel/valve specs → fine-tuned local LLM

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
END OF HANDOFF — 00 - Command
Next: Layer 5 RAG Agent build
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
