Skip to main content

Firestore Schema

DocWeb stores its data in Google Cloud Firestore with the following collections and schemas.

DiscoveredUrlDoc

Stores discovered URLs with metadata, clustering, and scraped content.

Collection: artifacts/{userId}/public/data/urls/{docId}

/**
 * A discovered URL together with its discovery metadata, cluster membership,
 * hierarchy information, and scraped content.
 *
 * Stored at `artifacts/{userId}/public/data/urls/{docId}`.
 */
interface DiscoveredUrlDoc {
  // ---- Core identification ----

  /** Full URL. */
  url: string;
  /** Base64url-encoded unique ID (max 128 chars). */
  docId: string;

  // ---- Discovery metadata ----

  discoverySource:
    | "sitemap"
    | "sitemap-index"
    | "navigation"
    | "content-link"
    | "footer"
    | "header";
  /** Parent URL (for navigation sources). */
  foundAt?: string;

  // ---- Nav-Scrape metadata (structural anchor nodes) ----

  /** Found in nav/aside/header. */
  isStructuralNode: boolean;
  /** 1.0 for anchor nodes, 0.5 default. */
  importanceScore: number;

  // ---- Clustering (OVERLAPPING: a URL can belong to multiple clusters) ----

  /** e.g., ["/features", "/api"] */
  clusterIds: string[];
  /** e.g., ["Features", "API"] */
  clusterLabels: string[];
  /** Main cluster (first path segment). */
  primaryClusterId: string;
  /** Main cluster label. */
  primaryClusterLabel: string;

  // ---- Hierarchy (for force-graph links) ----

  /** e.g., ["docs", "api", "users"] */
  path_segments: string[];
  /** Path depth. */
  depth: number;
  /** e.g., "/docs/api" */
  parentPath: string;
  /** docId of the parent URL. */
  parentId: string | null;

  // ---- Priority ----

  priority: "critical" | "high" | "medium" | "low";
  /** 0-100 (lower = more important). */
  priorityScore: number;
  isDocumentation: boolean;

  // ---- Classification (Inferred Semantic Framework) ----

  contentCategory: "primary" | "support" | "utility" | "unknown";

  // ---- Session tracking ----

  sessionId: string;
  searchId: string;
  rootUrl: string;
  status: "discovered" | "scraping" | "completed" | "error" | "archived";

  // ---- Language detection ----

  /** e.g., "en", "fr", "zh-cn" */
  language: string;

  // ---- Scraped content ----

  title?: string;
  /** First 500 chars. */
  description?: string;
  /** Markdown (max 50KB). */
  mainContent?: string;
  scrapingStatus: "pending" | "scraping" | "complete" | "failed" | "skipped";
  lastScraped?: Date;
  scrapeError?: string;

  // ---- Timestamps ----

  discovered_at: Date;
  updated_at: Date;
}

SavedSession

Stores a user's saved discovery sessions.

Collection: artifacts/{userId}/public/data/sessions/{sessionId}

/**
 * A saved discovery session belonging to a user.
 *
 * Stored at `artifacts/{userId}/public/data/sessions/{sessionId}`.
 */
interface SavedSession {
  /** Same as searchId. */
  sessionId: string;
  userId: string;
  /** e.g., "docs.stripe.com" */
  domain: string;
  rootUrl: string;
  title: string;

  // Counters
  urlCount: number;
  scrapedCount: number;
  messageCount: number;

  // Timestamps
  createdAt: Date;
  updatedAt: Date;
  lastAccessedAt: Date;

  status: "active" | "archived";
}

GlobalEmbeddingDocument

Stores vector embeddings for semantic search (shared across all users).

Collection: artifacts/global/embeddings/{urlHash}_chunk{N}

/**
 * A vector embedding for one chunk of a page, used for semantic search and
 * shared across all users.
 *
 * Stored at `artifacts/global/embeddings/{urlHash}_chunk{N}`.
 */
interface GlobalEmbeddingDocument {
  /** SHA256 hash (16 chars). */
  urlHash: string;
  url: string;
  /** For domain-scoped queries. */
  domain: string;

  chunkIndex: number;
  /** First 2000 chars of chunk. */
  chunkText: string;
  /** 768-dim vector. */
  embedding: number[];

  title: string;
  description: string;

  /** Sessions referencing this. */
  searchIds: string[];
  /** MD5 of content (12 chars). */
  contentHash: string;

  createdAt: Timestamp;
  updatedAt: Timestamp;
}

DomainCache

Caches discovered URLs for entire domains.

Collection: artifacts/global/domainCache/{domainHash}

/**
 * Cache of the URLs discovered for an entire domain.
 *
 * Stored at `artifacts/global/domainCache/{domainHash}`.
 */
interface DomainCache {
  domain: string;
  urls: string[];
  clusters: ClusterStats[];
  discoveryStats: DiscoveryStats;
  source: "waterfall" | "cache";
  discoveredAt: Timestamp;
  /** 24-hour TTL. */
  expiresAt: Timestamp;
}

PageCache

Caches scraped page content.

Collection: artifacts/global/pageCache/{urlHash}

/**
 * Cache of scraped page content.
 *
 * Stored at `artifacts/global/pageCache/{urlHash}`.
 */
interface PageCache {
  url: string;
  domain: string;

  // Scraped content
  title: string;
  description: string;
  mainContent: string;
  /** MD5 (12 chars). */
  contentHash: string;

  scrapedAt: Timestamp;
  /** 24-hour TTL. */
  expiresAt: Timestamp;
}

UserCreditsProfile

Stores user account and billing information.

Collection: users/{userId}

/**
 * User account and billing information.
 *
 * Stored at `users/{userId}`.
 */
interface UserCreditsProfile {
  email: string;
  tier: "free" | "pro" | "max";

  // Credits
  creditsUsedToday: number;
  creditsRemaining: number;
  creditsPerDay: number;
  maxSessions: number;
  /** ISO string (midnight UTC). */
  nextResetTime: string;

  // Usage totals
  totalDiscoveries: number;
  totalChatMessages: number;

  // Stripe billing (optional)
  stripeCustomerId?: string;
  subscriptionId?: string;
  subscriptionStatus?: "active" | "canceled" | "past_due";
}

Collection Hierarchy

firestore/
├── artifacts/
│   ├── {userId}/
│   │   └── public/
│   │       └── data/
│   │           ├── urls/{docId}              # DiscoveredUrlDoc
│   │           └── sessions/{sessionId}      # SavedSession
│   └── global/
│       ├── domainCache/{domainHash}          # DomainCache
│       ├── pageCache/{urlHash}               # PageCache
│       └── embeddings/{urlHash}_chunk{N}     # GlobalEmbeddingDocument
└── users/
    └── {userId}                              # UserCreditsProfile

Indexes

Required composite indexes (firestore.indexes.json):

{
"indexes": [
{
"collectionGroup": "pageCache",
"fields": [
{ "fieldPath": "domain", "order": "ASCENDING" },
{ "fieldPath": "scrapedAt", "order": "DESCENDING" }
]
}
]
}