From ce2ed72f8f0fd386541b8d7c87f2845727d31c6c Mon Sep 17 00:00:00 2001
From: Luis Gaspar Schroeder
Date: Sat, 15 Nov 2025 16:05:02 -0800
Subject: [PATCH 1/4] Added SemBenchmarkCombo to ReadMe

---
 README.md            | 17 +++++++++--------
 benchmarks/ReadMe.md | 11 ++++++-----
 tests/ReadMe.md      |  2 +-
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 79b705e..c9e049f 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ vCache is the first semantic prompt cache that guarantees user-defined error rat
 
 > vCache uses OpenAI by default for both LLM inference and embedding generation, but you can configure any other inference setup.
 
-## 🚀 Quick Install
+## Quick Install
 
 Install vCache in editable mode:
 
@@ -66,10 +66,10 @@ print(response)
 ```
 
 
-## 🎬 How vCache Works
+## How vCache Works
 
 vCache intelligently detects when a new prompt is semantically equivalent to a cached one, and adapts its decision boundaries based on your accuracy requirements.
-This lets it return cached model responses for semantically similar prompts—not just exact matches—reducing both inference latency and cost without sacrificing correctness.
+This lets it return cached model responses for semantically similar prompts (not just exact matches), reducing both inference latency and cost without sacrificing correctness.

 vCache Visualization
 
@@ -95,7 +95,7 @@ Applications can range from agentic systems and RAG pipelines to database system
 
 
 
-## ⚙️ Advanced Configuration
+## Advanced Configuration
 
 > [!NOTE]
 > vCache is currently in active development. Features and APIs may change as we continue to improve the system.
@@ -160,13 +160,13 @@ vCache supports FIFO, LRU, MRU, and a custom SCU eviction policy. See the [Evict
 
 
 
-## 🛠 Developer Guide
+## Developer Guide
 
 For development setup and contribution guidelines, see [CONTRIBUTING.md](CONTRIBUTING.md).
 
 
 
-## 📊 Benchmarking vCache
+## Benchmarking vCache
 
 vCache includes a benchmarking framework to evaluate:
 - **Cache hit rate**
@@ -174,14 +174,15 @@ vCache includes a benchmarking framework to evaluate:
 - **Latency improvement**
 - **...**
 
-We provide three open benchmarks:
+We provide four open benchmarks:
 - **SemCacheLmArena** (chat-style prompts) - [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkLmArena)
 - **SemCacheClassification** (classification queries) - [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkClassification)
 - **SemCacheSearchQueries** (real-world search logs) - [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkSearchQueries)
+- **SemBenchmarkCombo** (combines SemBenchmarkLmArena and SemBenchmarkSearchQueries, plus no-cache-hit scenarios) - [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkCombo)
 
 See the [Benchmarking Documentation](benchmarks/ReadMe.md) for instructions.
 
 
-## 📄 Citation
+## Citation
 
 If you use vCache for your research, please cite our [paper](https://arxiv.org/abs/2502.03771).

diff --git a/benchmarks/ReadMe.md b/benchmarks/ReadMe.md
index b56712e..238ec59 100644
--- a/benchmarks/ReadMe.md
+++ b/benchmarks/ReadMe.md
@@ -20,7 +20,7 @@ This directory provides the official benchmarking tools for evaluating the perfo
 
 
 
-## ⚙️ Installation
+## Installation
 
 To enable benchmarking capabilities, install vCache with the `benchmarks` extras from the project root:
 
@@ -29,7 +29,7 @@ pip install -e .[benchmarks]
 ```
 
 
-## 🚀 Running Benchmarks
+## Running Benchmarks
 
 Run the main benchmarking script from the project root:
 
@@ -40,7 +40,7 @@ python benchmarks/benchmark.py
 ```
 
 The script will automatically download the required datasets from Hugging Face based on the configurations in `RUN_COMBINATIONS`.
 
 
-## ⚙️ Custom Configuration
+## Custom Configuration
 
 The primary configuration is done by modifying the global variables in the `benchmarks/benchmark.py` script. This script is designed to benchmark the performance of vCache against several baselines by evaluating cache hit rates, accuracy, latency, and other metrics.
@@ -64,7 +64,7 @@ Refer to the docstring in `benchmarks/benchmark.py` for more details on other co
 
 
 
-## 📁 Datasets
+## Datasets
 
 ### vCache Datasets
 
@@ -73,6 +73,7 @@ The official benchmark datasets are hosted on Hugging Face and will be downloade
 
 - **`vCache/SemBenchmarkLmArena`** (chat-style prompts): [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkLmArena)
 - **`vCache/SemBenchmarkClassification`** (structured queries): [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkClassification)
 - **`vCache/SemBenchmarkSearchQueries`** (real-world browser searches): [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkSearchQueries)
+- **`vCache/SemBenchmarkCombo`** (combines SemBenchmarkLmArena and SemBenchmarkSearchQueries, plus no-cache-hit scenarios): [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkCombo)
 
 ### Custom Datasets
 
@@ -120,7 +121,7 @@ You can benchmark vCache on your own datasets. The script supports `.csv` and `.
 ```
 
 
-## 📦 Output
+## Output
 
 Benchmark results are saved to the `benchmarks/results/` directory, organized by dataset, embedding model, and LLM. For each run, the output includes:
 - **JSON files** containing raw data on cache hits, misses, latency, accuracy metrics, and internal vCache statistics.

diff --git a/tests/ReadMe.md b/tests/ReadMe.md
index 2eea550..60ebd01 100644
--- a/tests/ReadMe.md
+++ b/tests/ReadMe.md
@@ -14,7 +14,7 @@ Reliable and Efficient Semantic Prompt Caching
 
 
-## 🧪 Tests
+## Tests
 
 vCache includes both **unit tests** and **integration tests** to ensure correctness and reliability across its modular components.
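
A note on the datasets referenced in PATCH 1/4: the benchmark script downloads them from Hugging Face automatically, but they can also be inspected directly. Below is a minimal sketch using the `datasets` library (it assumes `pip install datasets`; the `train` split name is an assumption, so check the dataset card):

```python
from datasets import load_dataset

# Pull the new combined benchmark straight from Hugging Face and peek at it.
# The "train" split is assumed; call load_dataset("vCache/SemBenchmarkCombo")
# without `split` to list the splits the dataset actually ships with.
ds = load_dataset("vCache/SemBenchmarkCombo", split="train")
print(ds.features)  # column names and types
print(ds[0])        # first benchmark record
```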
From 219836790891a48804e1990a4c5903920cb8077b Mon Sep 17 00:00:00 2001
From: Luis Gaspar Schroeder
Date: Sat, 15 Nov 2025 18:22:41 -0800
Subject: [PATCH 2/4] Added SemBenchmarkCombo to benchmark script

---
 benchmarks/benchmark.py | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
index e9b000f..c86d783 100644
--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -151,7 +151,7 @@ class EmbeddingModel(Enum):
     E5_LARGE_V2 = ("emb_e5_large_v2", "E5_Large_v2", "float16", 512)
     E5_LARGE_V2_FT = ("emb_e5_large_v2_ft", "E5_Large_v2", "float16", 512)
     OPENAI_TEXT_EMBEDDING_SMALL = (
-        "emb_openai_text_embedding_small",
+        "emb_text-embedding-3-small",
         "text-embedding-3-small",
         "float16",
         1536,
@@ -177,7 +177,7 @@ class LargeLanguageModel(Enum):
         None,
     )
     GPT_4O_MINI = ("response_gpt-4o-mini", "GPT-4o-mini", "float16", None)
-    GPT_4O_NANO = ("response_gpt-4.1-nano", "GPT-4.1-nano", "float16", None)
+    GPT_4_1_NANO = ("response_gpt-4.1-nano", "GPT-4.1-nano", "float16", None)
     GPT_4_1 = ("response_gpt-4.1", "gpt-4.1-2025-04-14", "float16", None)
 
 
@@ -219,6 +219,8 @@ class Dataset(Enum):
     SEM_BENCHMARK_ARENA = "vCache/SemBenchmarkLmArena"
     # HuggingFace: https://huggingface.co/datasets/vCache/SemBenchmarkSearchQueries
     SEM_BENCHMARK_SEARCH_QUERIES = "vCache/SemBenchmarkSearchQueries"
+    # HuggingFace: https://huggingface.co/datasets/vCache/SemBenchmarkCombo
+    SEM_BENCHMARK_COMBO = "vCache/SemBenchmarkCombo"
     # Example for custom dataset. The path is relative to 'benchmarks/your_datasets/'
     CUSTOM_EXAMPLE = "your_datasets/your_custom_dataset.parquet"
 
@@ -238,7 +240,7 @@ class GeneratePlotsOnly(Enum):
 ### Benchmark Config ###################################################################################################
 ########################################################################################################################
-CONFIDENCE_INTERVALS_ITERATIONS: int = 3
+CONFIDENCE_INTERVALS_ITERATIONS: int = 1
 DISABLE_PROGRESS_BAR: bool = False
 KEEP_SPLIT: int = 100
 MAX_VECTOR_DB_CAPACITY: int = 150000
@@ -299,6 +301,26 @@ class GeneratePlotsOnly(Enum):
         MRUEvictionPolicy(max_size=2000, watermark=0.99, eviction_percentage=0.1),
         50,
     ),
+    # vCache Paper: Figure X (Third embedding model ablation)
+    (
+        EmbeddingModel.OPENAI_TEXT_EMBEDDING_SMALL,
+        LargeLanguageModel.GPT_4_1_NANO,
+        Dataset.SEM_BENCHMARK_ARENA,
+        GeneratePlotsOnly.NO,
+        BenchmarkComparisonSimilarityEvaluator(),
+        MRUEvictionPolicy(max_size=100000, watermark=0.99, eviction_percentage=0.1),
+        60000,
+    ),
+    # vCache Paper: Figure X (SemBenchmarkCombo)
+    (
+        EmbeddingModel.GTE,
+        LargeLanguageModel.LLAMA_3_8B,
+        Dataset.SEM_BENCHMARK_COMBO,
+        GeneratePlotsOnly.NO,
+        BenchmarkComparisonSimilarityEvaluator(),
+        MRUEvictionPolicy(max_size=100000, watermark=0.99, eviction_percentage=0.1),
+        27500,
+    ),
 ]
 
 BASELINES_TO_RUN: List[Baseline] = [
@@ -1433,4 +1455,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
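
A note on the `RUN_COMBINATIONS` entries added in PATCH 2/4: each entry is a seven-field tuple consumed by the benchmark loop. The annotated restatement below is not standalone code; it belongs inside `RUN_COMBINATIONS` in `benchmarks/benchmark.py`, where every referenced name is defined or imported, and reading the trailing integer as a per-run sample budget is inferred from the surrounding entries rather than stated in the diff:

```python
# One RUN_COMBINATIONS entry in benchmarks/benchmark.py, field by field:
(
    EmbeddingModel.GTE,                        # embedding model used for cache lookups
    LargeLanguageModel.LLAMA_3_8B,             # LLM whose responses are cached
    Dataset.SEM_BENCHMARK_COMBO,               # dataset replayed through the cache
    GeneratePlotsOnly.NO,                      # execute the run instead of only re-plotting
    BenchmarkComparisonSimilarityEvaluator(),  # judges cached vs. fresh responses
    MRUEvictionPolicy(max_size=100000, watermark=0.99, eviction_percentage=0.1),
    27500,                                     # inferred: sample budget for this run
),
```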
From 700d80d6395e6cf0bdd38b00d788b8100ad2e9db Mon Sep 17 00:00:00 2001
From: Luis Gaspar Schroeder
Date: Sat, 15 Nov 2025 18:24:19 -0800
Subject: [PATCH 3/4] Added tau latency logic

---
 vcache/vcache_policy/strategies/verified.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/vcache/vcache_policy/strategies/verified.py b/vcache/vcache_policy/strategies/verified.py
index f37289c..8e8539b 100644
--- a/vcache/vcache_policy/strategies/verified.py
+++ b/vcache/vcache_policy/strategies/verified.py
@@ -3,6 +3,9 @@
 import queue
 import random
 import threading
+import pandas as pd
+import time
+import datetime
 from concurrent.futures import ThreadPoolExecutor
 from enum import Enum
 from typing import Dict, List, Optional, Tuple
@@ -471,6 +474,7 @@ def __init__(self, delta: float):
             47: 0.02109,
             48: 0.01531,
         }
+        self.tau_latencies: List[float] = []
 
     def add_observation_to_metadata(
         self, similarity_score: float, is_correct: bool, metadata: EmbeddingMetadataObj
@@ -517,9 +521,19 @@ def select_action(
         metadata.t_hat = t_hat
         metadata.var_t = var_t
 
+        start_time = time.time()
         tau: float = self._get_tau(
             var_t=var_t, s=similarity_score, t_hat=t_hat, metadata=metadata
         )
+        latency = time.time() - start_time
+
+        # Uncomment this to save the tau latencies to a CSV file
+        self.tau_latencies.append(latency)
+        #if len(self.tau_latencies) % 10000 == 0:
+        #     df = pd.DataFrame(self.tau_latencies, columns=['latency'])
+        #     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        #     print(f"Saving tau latencies to CSV: tau_latencies_{timestamp}.csv (First value: {self.tau_latencies[0]:.5f}s)")
+        #     df.to_csv(f'tau_latencies_{timestamp}.csv', index=False)
 
         u: float = random.uniform(0, 1)
         if u <= tau:

From 9a1c3f84ae2e7b15fc98eb417bbf7d5548d6cb37 Mon Sep 17 00:00:00 2001
From: Luis Gaspar Schroeder
Date: Sat, 15 Nov 2025 18:29:25 -0800
Subject: [PATCH 4/4] Formatting

---
 benchmarks/benchmark.py                     | 2 +-
 vcache/vcache_policy/strategies/verified.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
index c86d783..9340b35 100644
--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -1455,4 +1455,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

diff --git a/vcache/vcache_policy/strategies/verified.py b/vcache/vcache_policy/strategies/verified.py
index 8e8539b..740b6d2 100644
--- a/vcache/vcache_policy/strategies/verified.py
+++ b/vcache/vcache_policy/strategies/verified.py
@@ -3,9 +3,7 @@
 import queue
 import random
 import threading
-import pandas as pd
 import time
-import datetime
 from concurrent.futures import ThreadPoolExecutor
 from enum import Enum
 from typing import Dict, List, Optional, Tuple
@@ -529,7 +527,7 @@ def select_action(
 
         # Uncomment this to save the tau latencies to a CSV file
         self.tau_latencies.append(latency)
-        #if len(self.tau_latencies) % 10000 == 0:
+        # if len(self.tau_latencies) % 10000 == 0:
         #     df = pd.DataFrame(self.tau_latencies, columns=['latency'])
         #     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
         #     print(f"Saving tau latencies to CSV: tau_latencies_{timestamp}.csv (First value: {self.tau_latencies[0]:.5f}s)")
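
A note on the instrumentation in PATCH 3/4 and PATCH 4/4: each `_get_tau` call is bracketed by two `time.time()` reads and the delta is buffered in `self.tau_latencies`, with the CSV export left commented out. Below is a self-contained sketch of that measurement pattern; `TauLatencyProbe` and `timed_call` are illustrative names, not vCache API:

```python
import time
from typing import Callable, List


class TauLatencyProbe:
    """Buffers wall-clock latencies of a wrapped call, mirroring the
    tau-timing logic added to verified.py; exporting stays opt-in."""

    def __init__(self) -> None:
        self.tau_latencies: List[float] = []

    def timed_call(self, fn: Callable, *args, **kwargs):
        start = time.time()  # the patch uses time.time(); perf_counter() is finer-grained
        result = fn(*args, **kwargs)
        self.tau_latencies.append(time.time() - start)
        return result


probe = TauLatencyProbe()
tau = probe.timed_call(lambda s: s * 0.5, 0.9)  # stand-in for self._get_tau(...)
print(f"tau={tau}, samples collected={len(probe.tau_latencies)}")
```

For interval timing, `time.perf_counter()` is usually preferable to `time.time()` because it is monotonic and has higher resolution; for the coarse latency distribution collected here, `time.time()` is adequate.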