From ce2ed72f8f0fd386541b8d7c87f2845727d31c6c Mon Sep 17 00:00:00 2001
From: Luis Gaspar Schroeder
Date: Sat, 15 Nov 2025 16:05:02 -0800
Subject: [PATCH 1/4] Added SemBenchmarkCombo to ReadMe

---
 README.md            | 17 +++++++++--------
 benchmarks/ReadMe.md | 11 ++++++-----
 tests/ReadMe.md      |  2 +-
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 79b705e..c9e049f 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ vCache is the first semantic prompt cache that guarantees user-defined error rat
 
 > vCache uses OpenAI by default for both LLM inference and embedding generation, but you can configure any other inference setup.
 
-## 🚀 Quick Install
+## Quick Install
 
 Install vCache in editable mode:
 
@@ -66,10 +66,10 @@ print(response)
 ```
 
 
-## 🎬 How vCache Works
+## How vCache Works
 
 vCache intelligently detects when a new prompt is semantically equivalent to a cached one, and adapts its decision boundaries based on your accuracy requirements.
-This lets it return cached model responses for semantically similar prompts—not just exact matches—reducing both inference latency and cost without sacrificing correctness.
+This lets it return cached model responses for semantically similar prompts (not just exact matches), reducing both inference latency and cost without sacrificing correctness.

 vCache Visualization
 
@@ -95,7 +95,7 @@ Applications can range from agentic systems and RAG pipelines to database system
 
 
 
-## ⚙️ Advanced Configuration
+## Advanced Configuration
 
 > [!NOTE]
 > vCache is currently in active development. Features and APIs may change as we continue to improve the system.
@@ -160,13 +160,13 @@ vCache supports FIFO, LRU, MRU, and a custom SCU eviction policy. See the [Evict
 
 
 
-## 🛠 Developer Guide
+## Developer Guide
 
 For development setup and contribution guidelines, see [CONTRIBUTING.md](CONTRIBUTING.md).
 
 
 
-## 📊 Benchmarking vCache
+## Benchmarking vCache
 
 vCache includes a benchmarking framework to evaluate:
 - **Cache hit rate**
@@ -174,14 +174,15 @@ vCache includes a benchmarking framework to evaluate:
 - **Latency improvement**
 - **...**
 
-We provide three open benchmarks:
+We provide four open benchmarks:
 - **SemCacheLmArena** (chat-style prompts) - [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkLmArena)
 - **SemCacheClassification** (classification queries) - [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkClassification)
 - **SemCacheSearchQueries** (real-world search logs) - [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkSearchQueries)
+- **SemBenchmarkCombo** (combines SemBenchmarkLmArena and SemBenchmarkSearchQueries, plus no-cache-hit scenarios) - [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkCombo)
 
 See the [Benchmarking Documentation](benchmarks/ReadMe.md) for instructions.
 
 
-## 📄 Citation
+## Citation
 
 If you use vCache for your research, please cite our [paper](https://arxiv.org/abs/2502.03771).

diff --git a/benchmarks/ReadMe.md b/benchmarks/ReadMe.md
index b56712e..238ec59 100644
--- a/benchmarks/ReadMe.md
+++ b/benchmarks/ReadMe.md
@@ -20,7 +20,7 @@ This directory provides the official benchmarking tools for evaluating the perfo
 
 
 
-## ⚙️ Installation
+## Installation
 
 To enable benchmarking capabilities, install vCache with the `benchmarks` extras from the project root:
 
@@ -29,7 +29,7 @@ pip install -e .[benchmarks]
 ```
 
 
-## 🚀 Running Benchmarks
+## Running Benchmarks
 
 Run the main benchmarking script from the project root:
 
@@ -40,7 +40,7 @@ python benchmarks/benchmark.py
 ```
 
 The script will automatically download the required datasets from Hugging Face based on the configurations in `RUN_COMBINATIONS`.
 
 
-## ⚙️ Custom Configuration
+## Custom Configuration
 
 The primary configuration is done by modifying the global variables in the `benchmarks/benchmark.py` script. This script is designed to benchmark the performance of vCache against several baselines by evaluating cache hit rates, accuracy, latency, and other metrics.
@@ -64,7 +64,7 @@ Refer to the docstring in `benchmarks/benchmark.py` for more details on other co
 
 
 
-## 📁 Datasets
+## Datasets
 
 ### vCache Datasets
 
@@ -73,6 +73,7 @@ The official benchmark datasets are hosted on Hugging Face and will be downloade
 
 - **`vCache/SemBenchmarkLmArena`** (chat-style prompts): [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkLmArena)
 - **`vCache/SemBenchmarkClassification`** (structured queries): [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkClassification)
 - **`vCache/SemBenchmarkSearchQueries`** (real-world browser searches): [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkSearchQueries)
+- **`vCache/SemBenchmarkCombo`** (combines SemBenchmarkLmArena and SemBenchmarkSearchQueries, plus no-cache-hit scenarios): [Dataset ↗](https://huggingface.co/datasets/vCache/SemBenchmarkCombo)
 
 ### Custom Datasets
 
@@ -120,7 +121,7 @@ You can benchmark vCache on your own datasets. The script supports `.csv` and `.
 ```
 
 
-## 📦 Output
+## Output
 
 Benchmark results are saved to the `benchmarks/results/` directory, organized by dataset, embedding model, and LLM. For each run, the output includes:
 - **JSON files** containing raw data on cache hits, misses, latency, accuracy metrics, and internal vCache statistics.

diff --git a/tests/ReadMe.md b/tests/ReadMe.md
index 2eea550..60ebd01 100644
--- a/tests/ReadMe.md
+++ b/tests/ReadMe.md
@@ -14,7 +14,7 @@ Reliable and Efficient Semantic Prompt Caching
 
 
-## 🧪 Tests
+## Tests
 
 vCache includes both **unit tests** and **integration tests** to ensure correctness and reliability across its modular components.
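
A note on the datasets referenced in PATCH 1/4: the benchmark script downloads them from Hugging Face automatically, but they can also be inspected directly. Below is a minimal sketch using the `datasets` library (it assumes `pip install datasets`; the `train` split name is an assumption, so check the dataset card):

```python
from datasets import load_dataset

# Pull the new combined benchmark straight from Hugging Face and peek at it.
# The "train" split is assumed; call load_dataset("vCache/SemBenchmarkCombo")
# without `split` to list the splits the dataset actually ships with.
ds = load_dataset("vCache/SemBenchmarkCombo", split="train")
print(ds.features)  # column names and types
print(ds[0])        # first benchmark record
```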
From 219836790891a48804e1990a4c5903920cb8077b Mon Sep 17 00:00:00 2001
From: Luis Gaspar Schroeder
Date: Sat, 15 Nov 2025 18:22:41 -0800
Subject: [PATCH 2/4] Added SemBenchmarkCombo to benchmark script

---
 benchmarks/benchmark.py | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
index e9b000f..c86d783 100644
--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -151,7 +151,7 @@ class EmbeddingModel(Enum):
     E5_LARGE_V2 = ("emb_e5_large_v2", "E5_Large_v2", "float16", 512)
     E5_LARGE_V2_FT = ("emb_e5_large_v2_ft", "E5_Large_v2", "float16", 512)
     OPENAI_TEXT_EMBEDDING_SMALL = (
-        "emb_openai_text_embedding_small",
+        "emb_text-embedding-3-small",
         "text-embedding-3-small",
         "float16",
         1536,
@@ -177,7 +177,7 @@ class LargeLanguageModel(Enum):
         None,
     )
     GPT_4O_MINI = ("response_gpt-4o-mini", "GPT-4o-mini", "float16", None)
-    GPT_4O_NANO = ("response_gpt-4.1-nano", "GPT-4.1-nano", "float16", None)
+    GPT_4_1_NANO = ("response_gpt-4.1-nano", "GPT-4.1-nano", "float16", None)
     GPT_4_1 = ("response_gpt-4.1", "gpt-4.1-2025-04-14", "float16", None)
 
 
@@ -219,6 +219,8 @@ class Dataset(Enum):
     SEM_BENCHMARK_ARENA = "vCache/SemBenchmarkLmArena"
     # HuggingFace: https://huggingface.co/datasets/vCache/SemBenchmarkSearchQueries
     SEM_BENCHMARK_SEARCH_QUERIES = "vCache/SemBenchmarkSearchQueries"
+    # HuggingFace: https://huggingface.co/datasets/vCache/SemBenchmarkCombo
+    SEM_BENCHMARK_COMBO = "vCache/SemBenchmarkCombo"
     # Example for custom dataset. The path is relative to 'benchmarks/your_datasets/'
     CUSTOM_EXAMPLE = "your_datasets/your_custom_dataset.parquet"
 
@@ -238,7 +240,7 @@ class GeneratePlotsOnly(Enum):
 ### Benchmark Config ###################################################################################################
 ########################################################################################################################
-CONFIDENCE_INTERVALS_ITERATIONS: int = 3
+CONFIDENCE_INTERVALS_ITERATIONS: int = 1
 DISABLE_PROGRESS_BAR: bool = False
 KEEP_SPLIT: int = 100
 MAX_VECTOR_DB_CAPACITY: int = 150000
@@ -299,6 +301,26 @@ class GeneratePlotsOnly(Enum):
         MRUEvictionPolicy(max_size=2000, watermark=0.99, eviction_percentage=0.1),
         50,
     ),
+    # vCache Paper: Figure X (Third embedding model ablation)
+    (
+        EmbeddingModel.OPENAI_TEXT_EMBEDDING_SMALL,
+        LargeLanguageModel.GPT_4_1_NANO,
+        Dataset.SEM_BENCHMARK_ARENA,
+        GeneratePlotsOnly.NO,
+        BenchmarkComparisonSimilarityEvaluator(),
+        MRUEvictionPolicy(max_size=100000, watermark=0.99, eviction_percentage=0.1),
+        60000,
+    ),
+    # vCache Paper: Figure X (SemBenchmarkCombo)
+    (
+        EmbeddingModel.GTE,
+        LargeLanguageModel.LLAMA_3_8B,
+        Dataset.SEM_BENCHMARK_COMBO,
+        GeneratePlotsOnly.NO,
+        BenchmarkComparisonSimilarityEvaluator(),
+        MRUEvictionPolicy(max_size=100000, watermark=0.99, eviction_percentage=0.1),
+        27500,
+    ),
 ]
 
 BASELINES_TO_RUN: List[Baseline] = [
@@ -1433,4 +1455,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
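
A note on the `RUN_COMBINATIONS` entries added in PATCH 2/4: each entry is a seven-field tuple consumed by the benchmark loop. The annotated restatement below is not standalone code; it belongs inside `RUN_COMBINATIONS` in `benchmarks/benchmark.py`, where every referenced name is defined or imported, and reading the trailing integer as a per-run sample budget is inferred from the surrounding entries rather than stated in the diff:

```python
# One RUN_COMBINATIONS entry in benchmarks/benchmark.py, field by field:
(
    EmbeddingModel.GTE,                        # embedding model used for cache lookups
    LargeLanguageModel.LLAMA_3_8B,             # LLM whose responses are cached
    Dataset.SEM_BENCHMARK_COMBO,               # dataset replayed through the cache
    GeneratePlotsOnly.NO,                      # execute the run instead of only re-plotting
    BenchmarkComparisonSimilarityEvaluator(),  # judges cached vs. fresh responses
    MRUEvictionPolicy(max_size=100000, watermark=0.99, eviction_percentage=0.1),
    27500,                                     # inferred: sample budget for this run
),
```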
From 700d80d6395e6cf0bdd38b00d788b8100ad2e9db Mon Sep 17 00:00:00 2001
From: Luis Gaspar Schroeder
Date: Sat, 15 Nov 2025 18:24:19 -0800
Subject: [PATCH 3/4] Added tau latency logic

---
 vcache/vcache_policy/strategies/verified.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/vcache/vcache_policy/strategies/verified.py b/vcache/vcache_policy/strategies/verified.py
index f37289c..8e8539b 100644
--- a/vcache/vcache_policy/strategies/verified.py
+++ b/vcache/vcache_policy/strategies/verified.py
@@ -3,6 +3,9 @@
 import queue
 import random
 import threading
+import pandas as pd
+import time
+import datetime
 from concurrent.futures import ThreadPoolExecutor
 from enum import Enum
 from typing import Dict, List, Optional, Tuple
@@ -471,6 +474,7 @@ def __init__(self, delta: float):
             47: 0.02109,
             48: 0.01531,
         }
+        self.tau_latencies: List[float] = []
 
     def add_observation_to_metadata(
         self, similarity_score: float, is_correct: bool, metadata: EmbeddingMetadataObj
@@ -517,9 +521,19 @@ def select_action(
         metadata.t_hat = t_hat
         metadata.var_t = var_t
 
+        start_time = time.time()
         tau: float = self._get_tau(
             var_t=var_t, s=similarity_score, t_hat=t_hat, metadata=metadata
         )
+        latency = time.time() - start_time
+
+        # Uncomment this to save the tau latencies to a CSV file
+        self.tau_latencies.append(latency)
+        #if len(self.tau_latencies) % 10000 == 0:
+        #     df = pd.DataFrame(self.tau_latencies, columns=['latency'])
+        #     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        #     print(f"Saving tau latencies to CSV: tau_latencies_{timestamp}.csv (First value: {self.tau_latencies[0]:.5f}s)")
+        #     df.to_csv(f'tau_latencies_{timestamp}.csv', index=False)
 
         u: float = random.uniform(0, 1)
         if u <= tau:

From 9a1c3f84ae2e7b15fc98eb417bbf7d5548d6cb37 Mon Sep 17 00:00:00 2001
From: Luis Gaspar Schroeder
Date: Sat, 15 Nov 2025 18:29:25 -0800
Subject: [PATCH 4/4] Formatting

---
 benchmarks/benchmark.py                     | 2 +-
 vcache/vcache_policy/strategies/verified.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
index c86d783..9340b35 100644
--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -1455,4 +1455,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

diff --git a/vcache/vcache_policy/strategies/verified.py b/vcache/vcache_policy/strategies/verified.py
index 8e8539b..740b6d2 100644
--- a/vcache/vcache_policy/strategies/verified.py
+++ b/vcache/vcache_policy/strategies/verified.py
@@ -3,9 +3,7 @@
 import queue
 import random
 import threading
-import pandas as pd
 import time
-import datetime
 from concurrent.futures import ThreadPoolExecutor
 from enum import Enum
 from typing import Dict, List, Optional, Tuple
@@ -529,7 +527,7 @@ def select_action(
 
         # Uncomment this to save the tau latencies to a CSV file
         self.tau_latencies.append(latency)
-        #if len(self.tau_latencies) % 10000 == 0:
+        # if len(self.tau_latencies) % 10000 == 0:
         #     df = pd.DataFrame(self.tau_latencies, columns=['latency'])
         #     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
         #     print(f"Saving tau latencies to CSV: tau_latencies_{timestamp}.csv (First value: {self.tau_latencies[0]:.5f}s)")
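
A note on the instrumentation in PATCH 3/4 and PATCH 4/4: each `_get_tau` call is bracketed by two `time.time()` reads and the delta is buffered in `self.tau_latencies`, with the CSV export left commented out. Below is a self-contained sketch of that measurement pattern; `TauLatencyProbe` and `timed_call` are illustrative names, not vCache API:

```python
import time
from typing import Callable, List


class TauLatencyProbe:
    """Buffers wall-clock latencies of a wrapped call, mirroring the
    tau-timing logic added to verified.py; exporting stays opt-in."""

    def __init__(self) -> None:
        self.tau_latencies: List[float] = []

    def timed_call(self, fn: Callable, *args, **kwargs):
        start = time.time()  # the patch uses time.time(); perf_counter() is finer-grained
        result = fn(*args, **kwargs)
        self.tau_latencies.append(time.time() - start)
        return result


probe = TauLatencyProbe()
tau = probe.timed_call(lambda s: s * 0.5, 0.9)  # stand-in for self._get_tau(...)
print(f"tau={tau}, samples collected={len(probe.tau_latencies)}")
```

For interval timing, `time.perf_counter()` is usually preferable to `time.time()` because it is monotonic and has higher resolution; for the coarse latency distribution collected here, `time.time()` is adequate.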