diff --git a/ext/crates/algebra/src/algebra/adem_algebra.rs b/ext/crates/algebra/src/algebra/adem_algebra.rs index 06d6bed313..509135e513 100644 --- a/ext/crates/algebra/src/algebra/adem_algebra.rs +++ b/ext/crates/algebra/src/algebra/adem_algebra.rs @@ -6,7 +6,7 @@ use itertools::Itertools; use rustc_hash::FxHashMap as HashMap; use fp::prime::{BinomialIterator, BitflagIterator, ValidPrime}; -use fp::vector::{FpVector, SliceMut}; +use fp::vector::{prelude::*, FpVector, SliceMut}; use once::OnceVec; use crate::algebra::combinatorics::{self, MAX_XI_TAU}; diff --git a/ext/crates/algebra/src/algebra/algebra_trait.rs b/ext/crates/algebra/src/algebra/algebra_trait.rs index e321c38bfd..77857da071 100644 --- a/ext/crates/algebra/src/algebra/algebra_trait.rs +++ b/ext/crates/algebra/src/algebra/algebra_trait.rs @@ -1,5 +1,5 @@ use fp::prime::ValidPrime; -use fp::vector::{Slice, SliceMut}; +use fp::vector::{prelude::*, Slice, SliceMut}; use std::fmt::Write as _; // Needed for write! macro for String diff --git a/ext/crates/algebra/src/algebra/combinatorics.rs b/ext/crates/algebra/src/algebra/combinatorics.rs index 20097d6e35..40b9333c79 100644 --- a/ext/crates/algebra/src/algebra/combinatorics.rs +++ b/ext/crates/algebra/src/algebra/combinatorics.rs @@ -1,4 +1,4 @@ -use fp::vector::FpVector; +use fp::vector::{prelude::*, FpVector}; use once::OnceVec; use fp::prime::{minus_one_to_the_n, Binomial, ValidPrime}; @@ -9,7 +9,7 @@ pub const MAX_XI_TAU: usize = MAX_MULTINOMIAL_LEN; /// If p is the nth prime, then `XI_DEGREES[n][i - 1]` is the degree of $ξ_i$ at the prime p divided by /// q, where q = 2p - 2 if p != 2 and 1 if p = 2. const XI_DEGREES: [[i32; MAX_XI_TAU]; NUM_PRIMES] = { - let mut res = [[0; 10]; 8]; + let mut res = [[0; 10]; NUM_PRIMES]; const_for! { p_idx in 0 .. 
NUM_PRIMES { let p = PRIMES[p_idx]; let mut p_to_the_i = p; @@ -26,7 +26,7 @@ const XI_DEGREES: [[i32; MAX_XI_TAU]; NUM_PRIMES] = { /// If p is the nth prime, then `TAU_DEGREES[n][i]` is the degree of $τ_i$ at the prime p. Its value is /// nonsense at the prime 2 const TAU_DEGREES: [[i32; MAX_XI_TAU]; NUM_PRIMES] = { - let mut res = [[0; 10]; 8]; + let mut res = [[0; 10]; NUM_PRIMES]; const_for! { p_idx in 0 .. NUM_PRIMES { let p = PRIMES[p_idx]; let mut p_to_the_i: u32 = 1; diff --git a/ext/crates/algebra/src/algebra/field.rs b/ext/crates/algebra/src/algebra/field.rs index 3c449bfd74..3f609265f7 100644 --- a/ext/crates/algebra/src/algebra/field.rs +++ b/ext/crates/algebra/src/algebra/field.rs @@ -2,7 +2,7 @@ use crate::algebra::{Algebra, Bialgebra}; use fp::prime::ValidPrime; -use fp::vector::{Slice, SliceMut}; +use fp::vector::{prelude::*, Slice, SliceMut}; /// $\mathbb{F}_p$, viewed as an [`Algebra`] over itself. /// diff --git a/ext/crates/algebra/src/algebra/milnor_algebra.rs b/ext/crates/algebra/src/algebra/milnor_algebra.rs index 647ad6ba26..0325dacdc8 100644 --- a/ext/crates/algebra/src/algebra/milnor_algebra.rs +++ b/ext/crates/algebra/src/algebra/milnor_algebra.rs @@ -5,7 +5,7 @@ use std::cell::Cell; use crate::algebra::combinatorics; use crate::algebra::{Algebra, Bialgebra, GeneratedAlgebra, UnstableAlgebra}; use fp::prime::{factor_pk, integer_power, Binomial, BitflagIterator, ValidPrime}; -use fp::vector::{FpVector, Slice, SliceMut}; +use fp::vector::{prelude::*, FpVector, Slice, SliceMut}; use once::OnceVec; #[cfg(feature = "json")] diff --git a/ext/crates/algebra/src/algebra/pair_algebra.rs b/ext/crates/algebra/src/algebra/pair_algebra.rs index 0ada4e288c..7302c775ac 100644 --- a/ext/crates/algebra/src/algebra/pair_algebra.rs +++ b/ext/crates/algebra/src/algebra/pair_algebra.rs @@ -9,7 +9,7 @@ use crate::combinatorics; use crate::Algebra; use fp::prime::TWO; -use fp::vector::{FpVector, Slice, SliceMut}; +use fp::vector::{prelude::*, FpVector, Slice, 
SliceMut}; use rustc_hash::FxHasher; type HashMap = hashbrown::HashMap>; diff --git a/ext/crates/algebra/src/algebra/polynomial_algebra.rs b/ext/crates/algebra/src/algebra/polynomial_algebra.rs index 783c692798..b308afb1fb 100644 --- a/ext/crates/algebra/src/algebra/polynomial_algebra.rs +++ b/ext/crates/algebra/src/algebra/polynomial_algebra.rs @@ -3,7 +3,7 @@ use rustc_hash::FxHashMap as HashMap; use std::fmt; use fp::prime::ValidPrime; -use fp::vector::{FpVector, SliceMut}; +use fp::vector::{prelude::*, FpVector, SliceMut}; use once::OnceVec; use crate::algebra::combinatorics::TruncatedPolynomialMonomialBasis; diff --git a/ext/crates/algebra/src/module/block_structure.rs b/ext/crates/algebra/src/module/block_structure.rs index 5c327b3af7..a258bf1ba4 100644 --- a/ext/crates/algebra/src/module/block_structure.rs +++ b/ext/crates/algebra/src/module/block_structure.rs @@ -1,5 +1,5 @@ use bivec::BiVec; -use fp::vector::{Slice, SliceMut}; +use fp::vector::{prelude::*, Slice, SliceMut}; use std::ops::Range; #[derive(Debug)] diff --git a/ext/crates/algebra/src/module/finite_dimensional_module.rs b/ext/crates/algebra/src/module/finite_dimensional_module.rs index 77c707f469..42b7efeed0 100644 --- a/ext/crates/algebra/src/module/finite_dimensional_module.rs +++ b/ext/crates/algebra/src/module/finite_dimensional_module.rs @@ -1,7 +1,7 @@ use crate::algebra::Algebra; use crate::module::{Module, ZeroModule}; use bivec::BiVec; -use fp::vector::{FpVector, SliceMut}; +use fp::vector::{prelude::*, FpVector, SliceMut}; use std::fmt::Write as _; use std::sync::Arc; @@ -178,7 +178,7 @@ impl Module for FiniteDimensionalModule { return; } let output = self.action(op_degree, op_index, mod_degree, mod_index); - result.add(output.as_slice(), coeff); + result.add(output, coeff); } fn max_degree(&self) -> Option { diff --git a/ext/crates/algebra/src/module/finitely_presented_module.rs b/ext/crates/algebra/src/module/finitely_presented_module.rs index 4587d72f4a..9bdde555c2 100644 --- 
a/ext/crates/algebra/src/module/finitely_presented_module.rs +++ b/ext/crates/algebra/src/module/finitely_presented_module.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use crate::algebra::Algebra; use crate::module::homomorphism::{FreeModuleHomomorphism, ModuleHomomorphism}; use crate::module::{FreeModule, Module, ZeroModule}; -use fp::vector::{FpVector, SliceMut}; +use fp::vector::{prelude::*, FpVector, SliceMut}; use once::OnceBiVec; #[cfg(feature = "json")] @@ -233,7 +233,7 @@ impl Module for FinitelyPresentedModule { ); let image = self.map.image(out_deg).unwrap(); image.reduce(temp_vec.as_slice_mut()); - for i in 0..result.as_slice().len() { + for i in 0..result.len() { let value = temp_vec.entry(self.fp_idx_to_gen_idx(out_deg, i)); result.add_basis_element(i, value); } diff --git a/ext/crates/algebra/src/module/free_module.rs b/ext/crates/algebra/src/module/free_module.rs index b9e270e542..a213338dca 100644 --- a/ext/crates/algebra/src/module/free_module.rs +++ b/ext/crates/algebra/src/module/free_module.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use crate::algebra::MuAlgebra; use crate::module::{Module, ZeroModule}; -use fp::vector::{Slice, SliceMut}; +use fp::vector::{prelude::*, Slice, SliceMut}; use once::{OnceBiVec, OnceVec}; #[derive(Clone, Debug)] diff --git a/ext/crates/algebra/src/module/hom_module.rs b/ext/crates/algebra/src/module/hom_module.rs index 9e6aa944f0..309279ed32 100644 --- a/ext/crates/algebra/src/module/hom_module.rs +++ b/ext/crates/algebra/src/module/hom_module.rs @@ -5,7 +5,7 @@ use bivec::BiVec; use crate::algebra::Field; use crate::module::block_structure::BlockStructure; use crate::module::{FreeModule, Module}; -use fp::vector::SliceMut; +use fp::vector::{prelude::*, SliceMut}; use once::OnceBiVec; /// Given a module N and a free module M, this is the module Hom(M, N) as a module over the ground diff --git a/ext/crates/algebra/src/module/homomorphism/free_module_homomorphism.rs 
b/ext/crates/algebra/src/module/homomorphism/free_module_homomorphism.rs index 22780c3fd0..925eb333b9 100644 --- a/ext/crates/algebra/src/module/homomorphism/free_module_homomorphism.rs +++ b/ext/crates/algebra/src/module/homomorphism/free_module_homomorphism.rs @@ -5,7 +5,7 @@ use crate::module::free_module::OperationGeneratorPair; use crate::module::homomorphism::{ModuleHomomorphism, ZeroHomomorphism}; use crate::module::{Module, MuFreeModule}; use fp::matrix::{MatrixSliceMut, QuasiInverse, Subspace}; -use fp::vector::{FpVector, Slice, SliceMut}; +use fp::vector::{prelude::*, FpVector, Slice, SliceMut}; use once::OnceBiVec; pub type FreeModuleHomomorphism = MuFreeModuleHomomorphism; @@ -55,10 +55,7 @@ where assert!(input_degree >= self.source.min_degree()); assert!(input_index < self.source.dimension(input_degree)); let output_degree = input_degree - self.degree_shift; - assert_eq!( - self.target.dimension(output_degree), - result.as_slice().len() - ); + assert_eq!(self.target.dimension(output_degree), result.len()); let OperationGeneratorPair { operation_degree, generator_degree, @@ -191,7 +188,6 @@ where } for (i, new_output) in new_outputs.iter_mut().enumerate() { new_output - .as_slice_mut() .assign(outputs_vectors.slice(target_dimension * i, target_dimension * (i + 1))); } self.outputs.push_checked(new_outputs, degree); @@ -213,7 +209,7 @@ where return; } for (i, new_output) in new_outputs.iter_mut().enumerate() { - new_output.as_slice_mut().assign(matrix.row(i)); + new_output.assign(matrix.row(i)); } self.outputs.push_checked(new_outputs, degree); } diff --git a/ext/crates/algebra/src/module/homomorphism/full_module_homomorphism.rs b/ext/crates/algebra/src/module/homomorphism/full_module_homomorphism.rs index e87a1690e8..6795ee50c7 100644 --- a/ext/crates/algebra/src/module/homomorphism/full_module_homomorphism.rs +++ b/ext/crates/algebra/src/module/homomorphism/full_module_homomorphism.rs @@ -5,7 +5,7 @@ use 
crate::module::homomorphism::{IdentityHomomorphism, ModuleHomomorphism, Zero use crate::module::Module; use bivec::BiVec; use fp::matrix::{Matrix, QuasiInverse, Subspace}; -use fp::vector::SliceMut; +use fp::vector::{prelude::*, SliceMut}; use once::OnceBiVec; /// A ModuleHomomorphism that simply records the matrix of the homomorphism in every degree. @@ -62,7 +62,7 @@ impl> ModuleHomomorphism ) { let output_degree = input_degree - self.degree_shift; if let Some(matrix) = self.matrices.get(output_degree) { - result.add(matrix[input_idx].as_slice(), coeff); + result.add(&matrix[input_idx], coeff); } } diff --git a/ext/crates/algebra/src/module/homomorphism/hom_pullback.rs b/ext/crates/algebra/src/module/homomorphism/hom_pullback.rs index 2aea6cb23d..c4966c7096 100644 --- a/ext/crates/algebra/src/module/homomorphism/hom_pullback.rs +++ b/ext/crates/algebra/src/module/homomorphism/hom_pullback.rs @@ -5,7 +5,7 @@ use crate::module::homomorphism::{FreeModuleHomomorphism, ModuleHomomorphism}; use crate::module::HomModule; use crate::module::{FreeModule, Module}; use fp::matrix::{QuasiInverse, Subspace}; -use fp::vector::SliceMut; +use fp::vector::{prelude::*, SliceMut}; use once::OnceBiVec; /// Given a map $\mathtt{map}: A \to B$ and hom modules $\mathtt{source} = \Hom(B, X)$, $\mathtt{target} = \Hom(A, X)$, produce the induced pullback map $\Hom(B, X) \to \Hom(A, X)$. 
diff --git a/ext/crates/algebra/src/module/homomorphism/mod.rs b/ext/crates/algebra/src/module/homomorphism/mod.rs index e610d35c14..3e711474b0 100644 --- a/ext/crates/algebra/src/module/homomorphism/mod.rs +++ b/ext/crates/algebra/src/module/homomorphism/mod.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use crate::module::Module; use fp::matrix::{AugmentedMatrix, Matrix, MatrixSliceMut, QuasiInverse, Subspace}; use fp::prime::ValidPrime; -use fp::vector::{Slice, SliceMut}; +use fp::vector::{prelude::*, Slice, SliceMut}; #[cfg(feature = "concurrent")] use rayon::prelude::*; diff --git a/ext/crates/algebra/src/module/homomorphism/quotient_homomorphism.rs b/ext/crates/algebra/src/module/homomorphism/quotient_homomorphism.rs index d77c7d661f..09ad6a15f0 100644 --- a/ext/crates/algebra/src/module/homomorphism/quotient_homomorphism.rs +++ b/ext/crates/algebra/src/module/homomorphism/quotient_homomorphism.rs @@ -1,5 +1,5 @@ use crate::module::{Module, QuotientModule}; -use fp::vector::{FpVector, SliceMut}; +use fp::vector::{prelude::*, FpVector, SliceMut}; use std::sync::Arc; use crate::module::homomorphism::ModuleHomomorphism; diff --git a/ext/crates/algebra/src/module/module_trait.rs b/ext/crates/algebra/src/module/module_trait.rs index 8f8d4aa251..5dc3f7df0f 100644 --- a/ext/crates/algebra/src/module/module_trait.rs +++ b/ext/crates/algebra/src/module/module_trait.rs @@ -3,7 +3,7 @@ use itertools::Itertools; use std::sync::Arc; use fp::prime::ValidPrime; -use fp::vector::{Slice, SliceMut}; +use fp::vector::{prelude::*, Slice, SliceMut}; use crate::algebra::Algebra; diff --git a/ext/crates/algebra/src/module/quotient_module.rs b/ext/crates/algebra/src/module/quotient_module.rs index c5204a0f0b..e012e0fd57 100644 --- a/ext/crates/algebra/src/module/quotient_module.rs +++ b/ext/crates/algebra/src/module/quotient_module.rs @@ -1,7 +1,7 @@ use crate::module::{Module, ZeroModule}; use bivec::BiVec; use fp::matrix::Subspace; -use fp::vector::{FpVector, Slice, SliceMut}; +use 
fp::vector::{prelude::*, FpVector, Slice, SliceMut}; use std::sync::Arc; /// A quotient of a module truncated below a fix degree. diff --git a/ext/crates/algebra/src/module/rpn.rs b/ext/crates/algebra/src/module/rpn.rs index e184c737fd..79fcd4732d 100644 --- a/ext/crates/algebra/src/module/rpn.rs +++ b/ext/crates/algebra/src/module/rpn.rs @@ -5,7 +5,7 @@ use crate::algebra::{ }; use crate::module::{Module, ZeroModule}; use fp::prime::{Binomial, TWO}; -use fp::vector::SliceMut; +use fp::vector::{prelude::*, SliceMut}; use std::sync::Arc; diff --git a/ext/crates/algebra/src/module/tensor_module.rs b/ext/crates/algebra/src/module/tensor_module.rs index 8c385b9f2a..3f6bec8521 100644 --- a/ext/crates/algebra/src/module/tensor_module.rs +++ b/ext/crates/algebra/src/module/tensor_module.rs @@ -5,7 +5,7 @@ use crate::algebra::{Algebra, Bialgebra}; use crate::module::block_structure::BlockStructure; use crate::module::{Module, ZeroModule}; use fp::prime::minus_one_to_the_n; -use fp::vector::{FpVector, Slice, SliceMut}; +use fp::vector::{prelude::*, FpVector, Slice, SliceMut}; use std::sync::Arc; diff --git a/ext/crates/algebra/src/steenrod_evaluator.rs b/ext/crates/algebra/src/steenrod_evaluator.rs index 846d724382..030955f403 100644 --- a/ext/crates/algebra/src/steenrod_evaluator.rs +++ b/ext/crates/algebra/src/steenrod_evaluator.rs @@ -3,7 +3,7 @@ use crate::algebra::{AdemAlgebra, Algebra, MilnorAlgebra}; use crate::milnor_algebra::{MilnorBasisElement, PPartEntry}; use crate::steenrod_parser::*; use fp::prime::ValidPrime; -use fp::vector::FpVector; +use fp::vector::{prelude::*, FpVector}; use anyhow::anyhow; use std::collections::BTreeMap; diff --git a/ext/crates/fp/Cargo.toml b/ext/crates/fp/Cargo.toml index 2acb5897ab..1cb3148c55 100644 --- a/ext/crates/fp/Cargo.toml +++ b/ext/crates/fp/Cargo.toml @@ -25,6 +25,7 @@ expect-test = "1.1.0" [build-dependencies] build_const = "0.2.2" +itertools = "0.10" [features] default = ["odd-primes"] diff --git 
a/ext/crates/fp/benches/criterion.rs b/ext/crates/fp/benches/criterion.rs index fd6d1ccc49..00b8a8913d 100644 --- a/ext/crates/fp/benches/criterion.rs +++ b/ext/crates/fp/benches/criterion.rs @@ -1,9 +1,12 @@ -use std::time::Duration; - use criterion::{criterion_group, criterion_main, BatchSize, Criterion}; use fp::{matrix::Matrix, prime::ValidPrime}; use rand::Rng; +#[cfg(feature = "odd-primes")] +static TEST_PRIMES: [u32; 4] = [2, 3, 5, 7]; +#[cfg(not(feature = "odd-primes"))] +static TEST_PRIMES: [u32; 1] = [2]; + fn random_matrix(p: ValidPrime, dimension: usize) -> Matrix { Matrix::from_vec( p, @@ -14,10 +17,15 @@ fn random_matrix(p: ValidPrime, dimension: usize) -> Matrix { } fn row_reductions(c: &mut Criterion) { - for p in [2, 3, 5, 7].iter() { + for p in TEST_PRIMES.iter() { let p = ValidPrime::new(*p); let mut group = c.benchmark_group(&format!("row_reduce_{}", p)); - for dimension in [10, 20, 69, 100, 420, 1000] { + let sizes = if *p == 2 { + vec![10, 20, 69, 100, 420, 1000, 2000, 4000] + } else { + vec![10, 20, 69, 100, 420, 1000] + }; + for dimension in sizes { group.bench_function(&format!("row_reduce_{}_{}", p, dimension), move |b| { b.iter_batched_ref( || random_matrix(p, dimension), @@ -41,7 +49,7 @@ fn random_vector(p: ValidPrime, dimension: usize) -> Vec { criterion_group! 
{ name = row_reduction; - config = Criterion::default().sample_size(100).measurement_time(Duration::from_secs(100)); + config = Criterion::default().sample_size(100).measurement_time(std::time::Duration::from_secs(100)); targets = row_reductions } diff --git a/ext/crates/fp/benches/iai.rs b/ext/crates/fp/benches/iai.rs index 3ec9ba287c..521eb30c83 100644 --- a/ext/crates/fp/benches/iai.rs +++ b/ext/crates/fp/benches/iai.rs @@ -117,6 +117,7 @@ fn row_reduce_7_420() { row_reduce_p_n(ValidPrime::new(7), 420); } +#[cfg(feature = "odd-primes")] iai::main!( row_reduce_2_10, row_reduce_2_20, @@ -143,3 +144,13 @@ iai::main!( row_reduce_7_420, row_reduce_7_1000, ); + +#[cfg(not(feature = "odd-primes"))] +iai::main!( + row_reduce_2_10, + row_reduce_2_20, + row_reduce_2_69, + row_reduce_2_100, + row_reduce_2_420, + row_reduce_2_1000, +); diff --git a/ext/crates/fp/build.rs b/ext/crates/fp/build.rs index 9d25e5a1b3..6ad146664f 100644 --- a/ext/crates/fp/build.rs +++ b/ext/crates/fp/build.rs @@ -1,12 +1,25 @@ use std::io::Error; use build_const::ConstWriter; +use itertools::Itertools; type Limb = u64; fn main() -> Result<(), Error> { - let num_primes = 8; - let primes = first_n_primes(num_primes); + #[cfg(feature = "odd-primes")] + let max_prime: u32 = 7; + #[cfg(not(feature = "odd-primes"))] + let max_prime: u32 = 2; + let primes = primes_up_to_n(max_prime); + + write_constants(&primes)?; + write_macros(&primes)?; + + Ok(()) +} + +fn write_constants(primes: &[u32]) -> Result<(), Error> { + let num_primes = primes.len(); let max_prime = *primes.last().unwrap(); let not_a_prime: usize = u32::MAX as usize; // Hack for 32-bit architectures let max_multinomial_len = 10; @@ -22,24 +35,24 @@ fn main() -> Result<(), Error> { writer.add_raw("/// The number of primes that are supported."); writer.add_value("NUM_PRIMES", "usize", num_primes); - writer.add_raw("/// The `MAX_PRIME`th prime number. 
Constructing a `ValidPrime` using any number larger than"); - writer.add_raw("/// this value will cause a panic."); + writer.add_raw( + "/// The `NUM_PRIMES`th prime number. Constructing a `ValidPrime` using any number larger \ + than this value will cause a panic.", + ); writer.add_value("MAX_PRIME", "usize", max_prime); - // `NOT_A_PRIME` is never used if odd-primes is disabled. - writer.add_raw("#[allow(dead_code)]"); writer.add_raw( - "/// A sentinel value. `PRIME_TO_INDEX_MAP[i] == NOT_A_PRIME` if and only if `i` is not", + "/// A sentinel value. `PRIME_TO_INDEX_MAP[i] == NOT_A_PRIME` if and only if `i` is less \ + than `MAX_PRIME` and not a prime number.", ); - writer.add_raw("/// a prime number."); + // `NOT_A_PRIME` is never used if odd-primes is disabled. + writer.add_raw("#[allow(dead_code)]"); writer.add_value("NOT_A_PRIME", "usize", not_a_prime); writer.add_value("MAX_MULTINOMIAL_LEN", "usize", max_multinomial_len); writer.add_raw("/// An array containing the first `NUM_PRIMES` prime numbers."); writer.add_array("PRIMES", "u32", &primes); writer.add_raw( - "/// For any integer `i` less than or equal to `MAX_PRIME`, `PRIME_TO_INDEX_MAP[i]` is", - ); - writer.add_raw( - "/// the index of `i` in `PRIMES` if `i` is prime; otherwise, it is `NOT_A_PRIME`.", + "/// For any integer `i` less than or equal to `MAX_PRIME`, `PRIME_TO_INDEX_MAP[i]` is \ + the index of `i` in `PRIMES` if `i` is prime; otherwise, it is `NOT_A_PRIME`.", ); writer.add_array("PRIME_TO_INDEX_MAP", "usize", &prime_to_index_map); @@ -55,16 +68,270 @@ fn main() -> Result<(), Error> { Ok(()) } -fn first_n_primes(n: usize) -> Vec { - let mut acc = vec![]; - let mut i = 2; - while acc.len() < n { - if is_prime(i) { - acc.push(i); - } - i += 1; - } - acc +fn write_macros(primes: &[u32]) -> Result<(), Error> { + let mut writer = ConstWriter::for_build("macros")?.finish_dependencies(); + + // methods taking `self` and `other` by reference + let ref_ref = primes + .iter() + .map(|&p| 
format!("(Self::_{p}(ref x), $other::_{p}(ref y)) => x.$method(y, $($arg),*),")) + .join("\n "); + + // methods taking `self` by mutable reference and `other` by reference + let mut_ref_ref = primes + .iter() + .map(|&p| { + format!("(Self::_{p}(ref mut x), $other::_{p}(ref y)) => x.$method(y, $($arg),*),") + }) + .join("\n "); + + // methods taking `self` by mutable reference and returning a prime-dependent type + let mut_ref_dispatch = primes + .iter() + .map(|&p| format!("Self::_{p}(ref mut x) => $ret::_{p}(x.$method($($arg),*)),")) + .join("\n "); + + // methods taking self by reference and returning a prime-dependent type + let ref_dispatch = primes + .iter() + .map(|&p| format!("Self::_{p}(ref x) => $ret::_{p}(x.$method($($arg),*)),")) + .join("\n "); + + // methods taking self by value and returning a prime-dependent type + let val_dispatch = primes + .iter() + .map(|&p| format!("Self::_{p}(x) => $ret::_{p}(x.$method($($arg),*)),")) + .join("\n "); + + // methods taking self by mutable reference + let mut_ref = primes + .iter() + .map(|&p| format!("Self::_{p}(ref mut x) => x.$method($($arg),*),")) + .join("\n "); + + // methods taking self by reference + let reff = primes + .iter() + .map(|p| format!("Self::_{p}(ref x) => x.$method($($arg),*),")) + .join("\n "); + + // dispatch prime generic + let dispatch_prime_generic = primes + .iter() + .map(|p| format!("(Self::_{p}(ref mut x), Slice::_{p}(y)) => x.$method(y $(,$arg)*),")) + .join("\n "); + + // generic match_p + let match_p = primes + .iter() + .map(|p| format!("{p} => Self::_{p}($($val)*),")) + .join("\n "); + + // dispatch type + let dispatch_type = primes + .iter() + .map(|p| format!("_{p}($generic<{p}>),")) + .join("\n "); + + // dispatch type with lifetime + let dispatch_type_life = primes + .iter() + .map(|p| format!("_{p}($generic<$life, {p}>),")) + .join("\n "); + + // implement `From` for references + let impl_from_ref = primes + .iter() + .map(|p| format!("$t1::_{p}(x) => $t2::_{p}($t2p::<'a, 
{p}>::from(x)),")) + .join("\n "); + + // match p over self + let match_self = primes + .iter() + .map(|p| format!("Self::_{p}(x) => $ret::_{p}(x.$method($($arg),*)),")) + .join("\n "); + + // match p over a triple (self, left, right) + let match_self_left_right = primes + .iter() + .map(|p| { + format!("(SliceMut::_{p}(ref mut x), Slice::_{p}(y), Slice::_{p}(z)) => {{ x.$method($($arg),*, y, z) }},") + }) + .join("\n "); + + // call a macro for all values of p + let call_macro = primes + .iter() + .map(|p| format!("$macro!(_{p}, {p});")) + .join("\n "); + + writer.add_raw(&format!(r#" +macro_rules! dispatch_prime_inner {{ + // other is a type, but marking it as a :ty instead of :tt means we cannot use it to access its + // enum variants. + ($vis:vis fn $method:ident(&self, other: &$other:tt $(, $arg:ident: $ty:ty )* ) $(-> $ret:ty)?) => {{ + $vis fn $method(&self, other: &$other, $($arg: $ty),* ) $(-> $ret)* {{ + match (self, other) {{ + {ref_ref} + (l, r) => {{ + panic!("Applying {{}} to vectors over different primes ({{}} and {{}})", stringify!($method), l.prime(), r.prime()); + }} + }} + }} + }}; + ($vis:vis fn $method:ident(&mut self, other: &$other:tt $(, $arg:ident: $ty:ty )* ) $(-> $ret:ty)?) 
=> {{ + #[allow(unused_parens)] + $vis fn $method(&mut self, other: &$other, $($arg: $ty),* ) $(-> $ret)* {{ + match (self, other) {{ + {mut_ref_ref} + (l, r) => {{ + panic!("Applying {{}} to vectors over different primes ({{}} and {{}})", stringify!($method), l.prime(), r.prime()); + }} + }} + }} + }}; + ($vis:vis fn $method:ident(&mut self $(, $arg:ident: $ty:ty )* ) -> (dispatch $ret:tt)) => {{ + $vis fn $method(&mut self, $($arg: $ty),* ) -> $ret {{ + match self {{ + {mut_ref_dispatch} + }} + }} + }}; + ($vis:vis fn $method:ident(&self $(, $arg:ident: $ty:ty )* ) -> (dispatch $ret:tt)) => {{ + $vis fn $method(&self, $($arg: $ty),* ) -> $ret {{ + match self {{ + {ref_dispatch} + }} + }} + }}; + ($vis:vis fn $method:ident(self $(, $arg:ident: $ty:ty )* ) -> (dispatch $ret:tt)) => {{ + $vis fn $method(self, $($arg: $ty),* ) -> $ret {{ + match self {{ + {val_dispatch} + }} + }} + }}; + ($vis:vis fn $method:ident(&mut self $(, $arg:ident: $ty:ty )* ) $(-> $ret:ty)?) => {{ + #[allow(unused_parens)] + $vis fn $method(&mut self, $($arg: $ty),* ) $(-> $ret)* {{ + match self {{ + {mut_ref} + }} + }} + }}; + ($vis:vis fn $method:ident(&self $(, $arg:ident: $ty:ty )* ) $(-> $ret:ty)?) => {{ + #[allow(unused_parens)] + $vis fn $method(&self, $($arg: $ty),* ) $(-> $ret)* {{ + match self {{ + {reff} + }} + }} + }}; +}} + +macro_rules! dispatch_prime {{ + () => {{}}; + ($vis:vis fn $method:ident $tt:tt $(-> $ret:tt)?; $($tail:tt)*) => {{ + dispatch_prime_inner! {{ + $vis fn $method $tt $(-> $ret)* + }} + dispatch_prime!{{$($tail)*}} + }}; +}} + +macro_rules! dispatch_prime_generic_inner {{ + (fn $method:ident(&mut self $(, $arg:ident: $ty:ty )*) $(-> $ret:ty)?) => {{ + fn $method<'b, T: Into>>(&mut self, other: T $(,$arg: $ty)*) $(-> $ret)? 
{{ + match (self, other.into()) {{ + {dispatch_prime_generic} + (l, r) => panic!( + "Applying add to vectors over different primes ({{}} and {{}})", + l.prime(), + r.prime() + ), + }} + }} + }} +}} + +/// Macro to implement the generic addition methods. +macro_rules! dispatch_prime_generic {{ + () => {{}}; + (fn $method:ident(&mut self $(, $arg:ident: $ty:ty )*) $(-> $ret:ty)?; $($tail:tt)*) => {{ + dispatch_prime_generic_inner! {{ + fn $method(&mut self $(, $arg: $ty )*) $(-> $ret)? + }} + dispatch_prime_generic!{{$($tail)*}} + }} +}} + +macro_rules! dispatch_type {{ + (derive($($derive_macro:tt)*), $vis:vis $special:ident {{ $generic:ident }}) => {{ + #[derive($($derive_macro)*)] + $vis enum $special {{ + {dispatch_type} + }} + }}; + (derive($($derive_macro:tt)*), $vis:vis $special:ident<$life:lifetime> {{ $generic:ident }}) => {{ + #[derive($($derive_macro)*)] + $vis enum $special<$life> {{ + {dispatch_type_life} + }} + }}; +}} + +macro_rules! impl_from_ref {{ + ($t1:tt, $t2:tt, $t2p:tt $(, $m:tt)?) => {{ + impl<'a, 'b> From<&'a $($m)* $t1<'b>> for $t2<'a> {{ + fn from(slice: &'a $($m)* $t1<'b>) -> $t2<'a> {{ + match slice {{ + {impl_from_ref} + }} + }} + }} + }}; +}} + +macro_rules! match_self_p {{ + ($method:ident(&$selff:ident $(, $arg:ident)*) -> $ret:tt) => {{ + match $selff {{ + {match_self} + }} + }}; +}} + +macro_rules! match_self_left_right_p {{ + ($method:ident(&mut $selff:ident $(, $arg:ident)*; $left:ident, $right:ident )) => {{ + match ($selff, $left, $right) {{ + {match_self_left_right} + _ => {{ + panic!(concat!("Applying method to vectors over different primes")); + }} + }} + }}; +}} + +macro_rules! match_p {{ + ($p:ident, $($val:tt)*) => {{ + match *$p {{ + {match_p} + _ => panic!("Prime not supported: {{}}", *$p) + }} + }}; +}} + +macro_rules! 
call_macro_p {{ + ($macro:ident) => {{ + {call_macro} + }}; +}}"#, + )); + + Ok(()) +} + +fn primes_up_to_n(n: u32) -> Vec { + (2..=n).filter(|&i| is_prime(i)).collect() } fn is_prime(i: u32) -> bool { diff --git a/ext/crates/fp/src/constants.rs b/ext/crates/fp/src/constants.rs index 74d17940aa..20963a8f86 100644 --- a/ext/crates/fp/src/constants.rs +++ b/ext/crates/fp/src/constants.rs @@ -4,7 +4,7 @@ build_const!("constants"); #[macro_export] macro_rules! const_for { - ($i:ident in $a:literal .. $b:ident $contents:block) => { + ($i:ident in $a:literal.. $b:ident $contents:block) => { let mut $i = $a; while $i < $b { $contents; diff --git a/ext/crates/fp/src/lib.rs b/ext/crates/fp/src/lib.rs index 62d75dc456..95187ce06b 100644 --- a/ext/crates/fp/src/lib.rs +++ b/ext/crates/fp/src/lib.rs @@ -1,17 +1,16 @@ +#![allow(unused_macros)] // For when odd-primes is disabled + mod constants; mod limb; pub use constants::{MAX_MULTINOMIAL_LEN, NUM_PRIMES, PRIMES, PRIME_TO_INDEX_MAP}; +#[macro_use] +pub(crate) mod macros; + pub mod matrix; pub mod prime; -#[cfg(feature = "odd-primes")] pub mod vector; -pub mod vector_2; -#[cfg(not(feature = "odd-primes"))] -pub use vector_2 as vector; - -pub mod vector_inner; pub(crate) mod simd; diff --git a/ext/crates/fp/src/limb.rs b/ext/crates/fp/src/limb.rs index 41e5a5e7c7..11dff98b4b 100644 --- a/ext/crates/fp/src/limb.rs +++ b/ext/crates/fp/src/limb.rs @@ -1,8 +1,7 @@ use std::ops::Range; pub(crate) use crate::constants::Limb; - -use crate::{constants::BITS_PER_LIMB, prime::ValidPrime}; +use crate::{constants::BITS_PER_LIMB, prime::ValidPrime, simd}; /// A struct containing the information required to access a specific entry in an array of `Limb`s. #[derive(Copy, Clone)] @@ -11,6 +10,152 @@ pub(crate) struct LimbBitIndexPair { pub(crate) bit_index: usize, } +/// A struct that defines a range of entries in a slice of limbs. +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct LimbLength { + /// The index of the first entry. 
We do not assume that this value is less than + /// `entries_per_limb_const::
<P>
()` in general, but some functions require it. See + /// [`apply_shift`]. + pub(crate) start: usize, + + /// The index of the last entry. + pub(crate) end: usize, + + /// The total number of limbs in the range. + /// + /// We store this value instead of computing it on the fly because benchmarks tend to show that + /// the tradeoff is beneficial in high dimensions (>1000). We might want to only enable this + /// when odd-primes is enabled, since computing this number is easier when `p == 2`, so the + /// tradeoff is potentially worse. + limbs: usize, +} + +impl LimbLength
<P>
{ + pub(crate) const fn from_logical(logical: usize) -> Self { + let limbs = number::
<P>
(logical); + Self { + start: 0, + end: logical, + limbs, + } + } + + /// Returns a `LimbLength` describing a vector starting at entry `start` and ending at entry + /// `end`. + pub(crate) const fn from_start_end(start: usize, end: usize) -> Self { + let limb_range = range::
<P>
(start, end); + Self { + start, + end, + limbs: limb_range.end - limb_range.start, + } + } + + #[inline] + pub(crate) const fn limbs(&self) -> usize { + self.limbs + } + + #[inline] + pub(crate) const fn logical(&self) -> usize { + self.end - self.start + } + + pub(crate) const fn contains(&self, other: &Self) -> bool { + self.start + other.end <= self.end + } + + /// Shift the entire `LimbLength` backwards so that the start of the range belongs to the first + /// limb, and return it together with the number of limbs shifted. + pub(crate) const fn apply_shift(&self) -> (Self, usize) { + let entries_per = entries_per_limb_const::
<P>
(); + let offset = self.start / entries_per; + let start = self.start - offset * entries_per; + let end = self.end - offset * entries_per; + (LimbLength::from_start_end(start, end), offset) + } + + pub(crate) const fn restrict_to(&self, other: Self) -> Self { + debug_assert!(self.contains(&other)); + Self::from_start_end(other.start + self.start, other.end + self.start) + } + + /// This function panics if `self.start != 0`. The `LimbLength` that is returned also satisfies + /// `self.start == 0`. + /// + /// It would be possible to make it work if we only assume `self.start % + /// entries_per_limb_const::
<P>
() == 0`, but this introduces slight complications, e.g. + /// depending on which of self.start or offset is bigger. While this can be solved by a + /// `saturating_sub`, the reason we leave it that way is because we only use it when trimming + /// the front of an `FpVector`, where the start is 0 by definition. + pub(crate) fn trim_start(&self, offset: usize) -> Self { + debug_assert_eq!(self.start, 0); + assert_eq!(offset % entries_per_limb_const::
<P>
(), 0); + let limb_shift = offset / entries_per_limb_const::
<P>
(); + Self { + start: self.start, + end: self.end - offset, + limbs: self.limbs - limb_shift, + } + } + + /// This function assumes that `self.start < entries_per_limb_const::
<P>
()`. A `LimbLength` + /// equivalent to `self` that does satisfy this condition can be obtained by calling + /// [`apply_shift`]. + #[inline] + pub(crate) const fn bit_offset(&self) -> usize { + self.start * bit_length_const::

() + } + + #[inline] + pub(crate) const fn limb_range(&self) -> Range { + range::

(self.start, self.end) + } + + /// # Panics + /// + /// This function underflows if `self.start + self.logical() == 0`, which happens if and only if + /// we are taking a slice of width 0 at the start of a limb. This should be a very rare edge + /// case. Dealing with the underflow properly would probably require using `saturating_sub` or + /// something of that nature, and that has a nontrivial (10%) performance hit. + #[inline] + pub(crate) fn limb_range_inner(&self) -> Range { + let range = self.limb_range(); + (range.start + 1)..(usize::max(range.start + 1, range.end - 1)) + } + + /// This function assumes that `self.start < entries_per_limb_const::

()`. A `LimbLength` + /// equivalent to `self` that does satisfy this condition can be obtained by calling + /// [`apply_shift`]. + #[inline(always)] + pub(crate) const fn min_limb_mask(&self) -> Limb { + !0 << self.bit_offset() + } + + #[inline(always)] + pub(crate) const fn max_limb_mask(&self) -> Limb { + let num_entries = 1 + (self.end - 1) % entries_per_limb_const::

(); + let bit_max = num_entries * bit_length_const::

(); + + (!0) >> (BITS_PER_LIMB - bit_max) + } + + /// This function assumes that `self.start < entries_per_limb_const::

()`. A `LimbLength` + /// equivalent to `self` that does satisfy this condition can be obtained by calling + /// [`apply_shift`]. + #[inline(always)] + pub(crate) fn limb_masks(&self) -> (Limb, Limb) { + if self.limb_range().len() == 1 { + ( + self.min_limb_mask() & self.max_limb_mask(), + self.min_limb_mask() & self.max_limb_mask(), + ) + } else { + (self.min_limb_mask(), self.max_limb_mask()) + } + } +} + /// Return the number of bits an element of $\mathbb{F}_P$ occupies in a limb. pub(crate) const fn bit_length(p: ValidPrime) -> usize { let p = p.value(); @@ -46,6 +191,29 @@ pub(crate) const fn entries_per_limb_const() -> usize { BITS_PER_LIMB / bit_length_const::

() } +/// This is identical to [`limb::number`], except that it's not const. Hopefully almost every method +/// in the limb crate can be const once the matrix rewrite is in place. +pub(crate) const fn num_limbs(p: ValidPrime, len: usize) -> usize { + let entries_per_limb = entries_per_limb(p); + (len + entries_per_limb - 1) / entries_per_limb +} + +pub(crate) const fn padded_len(p: ValidPrime, len: usize) -> usize { + num_limbs(p, len) * entries_per_limb(p) +} + +/// The number of bits that the entries occupy in total. This number is close to [`BITS_PER_LIMB`], +/// but often slightly lower unless `P == 2`. +pub(crate) const fn used_bits() -> usize { + entries_per_limb_const::

() * bit_length_const::

() +} + +/// A mask on the region that contains entries. Limbs are usually assumed to satisfy the condition +/// `limb & !used_mask() == 0`. +pub(crate) const fn used_mask() -> Limb { + !0 >> (BITS_PER_LIMB - used_bits::

()) +} + pub(crate) const fn limb_bit_index_pair(idx: usize) -> LimbBitIndexPair { LimbBitIndexPair { limb: idx / entries_per_limb_const::

(), @@ -63,6 +231,19 @@ pub(crate) const fn add(limb_a: Limb, limb_b: Limb, coeff: u32) -> } } +/// Add (`c` times) all of the limbs in `rhs` to the limbs in `lhs`. This is optimized to use SIMD +/// when `P == 2`. +pub(crate) fn add_all(lhs: &mut [Limb], rhs: &[Limb], c: u32) { + if P == 2 { + simd::add_simd(lhs, rhs, 0); + } else { + for (left, right) in lhs.iter_mut().zip(rhs) { + *left = add::

(*left, *right, c); + *left = reduce::

(*left); + } + } +} + /// Return the `Limb` whose entries are the entries of `limb` reduced modulo `P`. /// /// Contributed by Robert Burklund. @@ -101,8 +282,7 @@ pub(crate) fn is_reduced(limb: Limb) -> bool { limb == reduce::

(limb) } -/// Given an interator of `u32`'s, pack all of them into a single limb in order. -/// It is assumed that +/// Given an interator of `u32`'s, pack all of them into a single limb in order. It is assumed that /// - The values of the iterator are less than P /// - The values of the iterator fit into a single limb /// @@ -131,13 +311,12 @@ pub(crate) fn unpack(mut limb: Limb) -> impl Iterator }) } -/// Return the number of limbs required to hold `dim` entries. +/// Return the number of limbs required to hold `dim` entries. This is identical to +/// [`limb::num_limbs`], except the latter is not const. Hopefully almost every method in the limb +/// crate can be const once the matrix rewrite is in place. pub(crate) const fn number(dim: usize) -> usize { - if dim == 0 { - 0 - } else { - limb_bit_index_pair::

(dim - 1).limb + 1 - } + let entries_per_limb = entries_per_limb_const::

(); + (dim + entries_per_limb - 1) / entries_per_limb } /// Return the `Range` starting at the index of the limb containing the `start`th entry, and @@ -152,7 +331,7 @@ pub(crate) const fn range(start: usize, end: usize) -> Range u32 { +pub(crate) const fn sign_rule(mut target: Limb, mut source: Limb) -> u32 { let mut result = 0; let mut n = 1; // Empirically, the compiler unrolls this loop because BITS_PER_LIMB is a constant. diff --git a/ext/crates/fp/src/macros.rs b/ext/crates/fp/src/macros.rs new file mode 100644 index 0000000000..c438cbc796 --- /dev/null +++ b/ext/crates/fp/src/macros.rs @@ -0,0 +1,3 @@ +use build_const::build_const; + +build_const!("macros"); diff --git a/ext/crates/fp/src/matrix/matrix_inner.rs b/ext/crates/fp/src/matrix/matrix_inner.rs index 85eda3a6e8..b812a0608e 100644 --- a/ext/crates/fp/src/matrix/matrix_inner.rs +++ b/ext/crates/fp/src/matrix/matrix_inner.rs @@ -1,7 +1,8 @@ use super::{QuasiInverse, Subspace}; +use crate::limb; use crate::matrix::m4ri::M4riTable; use crate::prime::{self, ValidPrime}; -use crate::vector::{FpVector, Slice, SliceMut}; +use crate::vector::{prelude::*, FpVector, Slice, SliceMut}; use std::fmt; use std::io::{Read, Write}; @@ -170,6 +171,7 @@ impl Matrix { /// /// # Example /// ``` + /// # #[cfg(feature="odd-primes")] { /// # use fp::prime::ValidPrime; /// let p = ValidPrime::new(7); /// # use fp::matrix::Matrix; @@ -177,6 +179,8 @@ impl Matrix { /// vec![0, 3, 4]]; /// /// let m = Matrix::from_vec(p, &input); + /// # } + /// ``` pub fn from_vec(p: ValidPrime, input: &[Vec]) -> Matrix { let rows = input.len(); if rows == 0 { @@ -204,6 +208,7 @@ impl Matrix { /// /// # Example /// ``` + /// # #[cfg(feature="odd-primes")] { /// # use fp::prime::ValidPrime; /// let p = ValidPrime::new(7); /// # use fp::matrix::Matrix; @@ -213,10 +218,12 @@ impl Matrix { /// /// let (n, m) = Matrix::augmented_from_vec(p, &input); /// assert!(n >= input[0].len()); + /// # } + /// ``` pub fn augmented_from_vec(p: ValidPrime, input: 
&[Vec]) -> (usize, Matrix) { let rows = input.len(); let cols = input[0].len(); - let padded_cols = FpVector::padded_len(p, cols); + let padded_cols = limb::padded_len(p, cols); let mut m = Matrix::new(p, rows, padded_cols + rows); for i in 0..rows { @@ -484,6 +491,7 @@ impl Matrix { /// /// # Example /// ``` + /// # #[cfg(feature="odd-primes")] { /// # use fp::prime::ValidPrime; /// let p = ValidPrime::new(7); /// # use fp::matrix::Matrix; @@ -498,6 +506,7 @@ impl Matrix { /// m.row_reduce(); /// /// assert_eq!(m, Matrix::from_vec(p, &result)); + /// # } /// ``` pub fn row_reduce(&mut self) -> usize { let p = self.p; @@ -606,6 +615,7 @@ impl Matrix { /// /// # Example /// ``` + /// # #[cfg(feature="odd-primes")] { /// # use fp::prime::ValidPrime; /// let p = ValidPrime::new(3); /// # use fp::matrix::Matrix; @@ -621,6 +631,7 @@ impl Matrix { /// let preimage = [vec![0, 1, 0], /// vec![0, 2, 2]]; /// assert_eq!(qi.preimage(), &Matrix::from_vec(p, &preimage)); + /// # } /// ``` pub fn compute_quasi_inverse( &self, @@ -649,6 +660,7 @@ impl Matrix { /// /// # Example /// ``` + /// # #[cfg(feature="odd-primes")] { /// # use fp::prime::ValidPrime; /// let p = ValidPrime::new(3); /// # use fp::matrix::Matrix; @@ -666,6 +678,7 @@ impl Matrix { /// vec![0, 1, 1, 0, 1]]; /// assert_eq!(computed_image.matrix, Matrix::from_vec(p, &image)); /// assert_eq!(computed_image.pivots(), &vec![0, 1, -1, -1, -1]); + /// # } /// ``` pub fn compute_image(&self, last_target_col: usize, first_source_col: usize) -> Subspace { let p = self.prime(); @@ -700,6 +713,7 @@ impl Matrix { /// /// # Example /// ``` + /// # #[cfg(feature="odd-primes")] { /// # use fp::prime::ValidPrime; /// let p = ValidPrime::new(3); /// # use fp::matrix::Matrix; @@ -714,6 +728,7 @@ impl Matrix { /// /// let mut target = vec![0; 3]; /// assert_eq!(Vec::::from(&ker.matrix[0]), vec![1, 1, 2]); + /// # } /// ``` pub fn compute_kernel(&self, first_source_column: usize) -> Subspace { let p = self.p; @@ -840,7 +855,6 @@ 
impl Matrix { new_row .slice_mut(start_column, start_column + desired_image.matrix.columns) .assign(new_image); - self.vectors.push(new_row); added_pivots.push(i); @@ -852,10 +866,11 @@ impl Matrix { /// /// # Example /// ``` + /// # #[cfg(feature="odd-primes")] { /// # use fp::prime::ValidPrime; /// let p = ValidPrime::new(7); /// # use fp::matrix::Matrix; - /// # use fp::vector::FpVector; + /// # use fp::vector::{prelude::*, FpVector}; /// let input = [vec![1, 3, 6], /// vec![0, 3, 4]]; /// @@ -865,14 +880,12 @@ impl Matrix { /// let desired_result = FpVector::from_slice(p, &vec![3, 5, 1]); /// m.apply(result.as_slice_mut(), 1, v.as_slice()); /// assert_eq!(result, desired_result); + /// # } /// ``` pub fn apply(&self, mut result: SliceMut, coeff: u32, input: Slice) { debug_assert_eq!(input.len(), self.rows()); for i in 0..input.len() { - result.add( - self.vectors[i].as_slice(), - (coeff * input.entry(i)) % *self.p, - ); + result.add(&self.vectors[i], (coeff * input.entry(i)) % *self.p); } } @@ -952,7 +965,7 @@ impl AugmentedMatrix { let mut start = [0; N]; let mut end = [0; N]; for i in 1..N { - start[i] = start[i - 1] + FpVector::padded_len(p, columns[i - 1]); + start[i] = start[i - 1] + limb::padded_len(p, columns[i - 1]); } for i in 0..N { end[i] = start[i] + columns[i]; @@ -975,7 +988,7 @@ impl AugmentedMatrix { let mut start = [0; N]; let mut end = [0; N]; for i in 1..N { - start[i] = start[i - 1] + FpVector::padded_len(p, columns[i - 1]); + start[i] = start[i - 1] + limb::padded_len(p, columns[i - 1]); } for i in 0..N { end[i] = start[i] + columns[i]; diff --git a/ext/crates/fp/src/matrix/quasi_inverse.rs b/ext/crates/fp/src/matrix/quasi_inverse.rs index 2760151141..121e5e9060 100644 --- a/ext/crates/fp/src/matrix/quasi_inverse.rs +++ b/ext/crates/fp/src/matrix/quasi_inverse.rs @@ -1,6 +1,6 @@ use super::Matrix; use crate::prime::ValidPrime; -use crate::vector::{FpVector, Slice, SliceMut}; +use crate::vector::{prelude::*, FpVector, Slice, SliceMut}; use 
byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use std::io::{Read, Write}; @@ -138,7 +138,7 @@ impl QuasiInverse { } } if c != 0 { - target.add(self.preimage[row].as_slice(), (coeff * c) % *p); + target.add(&self.preimage[row], (coeff * c) % *p); } row += 1; } diff --git a/ext/crates/fp/src/matrix/subquotient.rs b/ext/crates/fp/src/matrix/subquotient.rs index 0aeca4c323..ea4a3d9d8f 100644 --- a/ext/crates/fp/src/matrix/subquotient.rs +++ b/ext/crates/fp/src/matrix/subquotient.rs @@ -1,7 +1,7 @@ use super::Subspace; use crate::matrix::Matrix; use crate::prime::ValidPrime; -use crate::vector::{FpVector, Slice, SliceMut}; +use crate::vector::{prelude::*, FpVector, Slice, SliceMut}; #[derive(Clone)] pub struct Subquotient { @@ -47,7 +47,7 @@ impl Subquotient { if self.gens.pivots()[i] < 0 { continue; } - let c = elt.as_slice().entry(i); + let c = elt.entry(i); result.push(c); if c != 0 { elt.add( @@ -175,6 +175,7 @@ impl Subquotient { } } +#[cfg(feature = "odd-primes")] #[cfg(test)] mod test { use super::*; diff --git a/ext/crates/fp/src/matrix/subspace.rs b/ext/crates/fp/src/matrix/subspace.rs index 488a684b6a..29223cd26f 100644 --- a/ext/crates/fp/src/matrix/subspace.rs +++ b/ext/crates/fp/src/matrix/subspace.rs @@ -1,6 +1,6 @@ use super::Matrix; use crate::prime::ValidPrime; -use crate::vector::{FpVector, Slice, SliceMut}; +use crate::vector::{prelude::*, FpVector, Slice, SliceMut}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use std::io::{Read, Write}; @@ -113,8 +113,8 @@ impl Subspace { /// Projects a vector to a complement of the subspace. 
The complement is the set of vectors /// that have a 0 in every column where there is a pivot in `matrix` - pub fn reduce(&self, mut vector: SliceMut) { - assert_eq!(vector.as_slice().len(), self.ambient_dimension()); + pub fn reduce(&self, mut vector: T) { + assert_eq!(vector.len(), self.ambient_dimension()); if self.matrix.rows() == 0 { return; } @@ -127,7 +127,7 @@ impl Subspace { .map(|(col, _)| col) .zip(self.iter()); for (col, row) in iter { - let c = vector.as_slice().entry(col); + let c = vector.entry(col); if c != 0 { vector.add(row, *p - c); } @@ -135,7 +135,7 @@ impl Subspace { } pub fn contains(&self, vector: Slice) -> bool { - let mut vector: FpVector = vector.to_owned(); + let mut vector: FpVector = vector.into_owned(); self.reduce(vector.as_slice_mut()); vector.is_zero() } diff --git a/ext/crates/fp/src/prime.rs b/ext/crates/fp/src/prime.rs index 8e0fdc6513..711dfde8d4 100644 --- a/ext/crates/fp/src/prime.rs +++ b/ext/crates/fp/src/prime.rs @@ -1,6 +1,7 @@ +use std::convert::TryFrom; + #[cfg(feature = "json")] use serde::{de::Error, Deserialize, Deserializer, Serialize, Serializer}; -use std::convert::TryFrom; use crate::constants::{ BINOMIAL4_TABLE, BINOMIAL4_TABLE_SIZE, BINOMIAL_TABLE, INVERSE_TABLE, PRIME_TO_INDEX_MAP, @@ -254,7 +255,6 @@ pub trait Binomial: Sized { /// This is easy to verify using the fact that /// /// (x + y)^{2^k} = x^{2^k} + 2 x^{2^{k - 1}} y^{2^{k - 1}} + y^{2^k} - /// fn binomial4(n: Self, k: Self) -> Self; /// Compute binomial coefficients mod 4 using the recursion relation in the documentation of @@ -321,6 +321,7 @@ macro_rules! impl_binomial { 0 } } + #[inline] fn multinomial_odd(p_: ValidPrime, l: &mut [Self]) -> Self { let p = *p_ as Self; @@ -398,6 +399,7 @@ macro_rules! 
impl_binomial { } false } + fn binomial4(n: Self, j: Self) -> Self { if (n as usize) < BINOMIAL4_TABLE_SIZE { return BINOMIAL4_TABLE[n as usize][j as usize] as Self; @@ -465,6 +467,7 @@ impl BitflagIterator { impl Iterator for BitflagIterator { type Item = bool; + fn next(&mut self) -> Option { if self.remaining > 64 && self.flag == 0 || self.remaining == 0 { None @@ -493,6 +496,7 @@ impl BinomialIterator { impl Iterator for BinomialIterator { type Item = u32; + fn next(&mut self) -> Option { let v = self.value; let c = v & v.wrapping_neg(); @@ -506,11 +510,12 @@ impl Iterator for BinomialIterator { #[cfg(test)] mod tests { use super::*; + // // #[cfg(feature = "odd-primes")] use crate::constants::PRIMES; #[test] fn inverse_test() { - for &p in PRIMES.iter() { + for p in PRIMES { let p = ValidPrime::new(p); for k in 1..*p { assert_eq!((inverse(p, k) * k) % *p, 1); @@ -520,8 +525,12 @@ mod tests { #[test] fn binomial_test() { + #[cfg(feature = "odd-primes")] let entries = [[2, 2, 1, 0], [2, 3, 1, 1], [3, 1090, 730, 1], [7, 3, 2, 3]]; + #[cfg(not(feature = "odd-primes"))] + let entries = [[2, 2, 1, 0], [2, 3, 1, 1]]; + for entry in &entries { assert_eq!( entry[3] as u32, @@ -532,7 +541,7 @@ mod tests { #[test] fn binomial_vs_monomial() { - for &p in &[2, 3, 5, 7, 11] { + for p in PRIMES { let p = ValidPrime::new(p); for l in 0..20 { for m in 0..20 { @@ -558,7 +567,7 @@ mod tests { for n in 0..12 { for j in 0..=n { let ans = binomial_full(n, j); - for &p in &[2, 3, 5, 7, 11] { + for p in PRIMES { assert_eq!( u32::binomial(ValidPrime::new(p), n, j), ans % p, diff --git a/ext/crates/fp/src/vector/base_generic.rs b/ext/crates/fp/src/vector/base_generic.rs new file mode 100644 index 0000000000..9845ac1784 --- /dev/null +++ b/ext/crates/fp/src/vector/base_generic.rs @@ -0,0 +1,150 @@ +use super::{ + generic::{FpVectorIterator, FpVectorNonZeroIteratorP, FpVectorP, SliceMutP, SliceP}, + internal::{InternalBaseVectorMutP, InternalBaseVectorP}, +}; +use 
crate::{limb::LimbLength, prime::ValidPrime}; + +pub trait BaseVectorP: InternalBaseVectorP

{ + fn prime(&self) -> ValidPrime; + fn len(&self) -> usize; + fn is_empty(&self) -> bool; + fn entry(&self, index: usize) -> u32; + fn slice<'a>(&self, start: usize, end: usize) -> SliceP<'a, P> + where + Self: 'a; + fn as_slice(&self) -> SliceP

; + fn is_zero(&self) -> bool; + fn iter(&self) -> FpVectorIterator; + fn iter_nonzero(&self) -> FpVectorNonZeroIteratorP

; + fn first_nonzero(&self) -> Option<(usize, u32)>; + fn sign_rule>(&self, other: T) -> bool; + fn into_owned(self) -> FpVectorP

; + fn density(&self) -> f32; +} + +pub trait BaseVectorMutP: InternalBaseVectorMutP

+ BaseVectorP

{ + fn scale(&mut self, c: u32); + fn set_to_zero(&mut self); + fn set_entry(&mut self, index: usize, value: u32); + fn assign>(&mut self, other: T); + fn add>(&mut self, other: T, c: u32); + fn add_offset>(&mut self, other: T, c: u32, offset: usize); + fn slice_mut(&mut self, start: usize, end: usize) -> SliceMutP

; + fn as_slice_mut(&mut self) -> SliceMutP

; + fn add_basis_element(&mut self, index: usize, value: u32); + fn copy_from_slice(&mut self, slice: &[u32]); + fn add_masked>(&mut self, other: T, c: u32, mask: &[usize]); + fn add_unmasked>(&mut self, other: T, c: u32, mask: &[usize]); + fn add_truncate>(&mut self, other: T, c: u32) -> Option<()>; +} + +impl, const P: u32> BaseVectorP

for T { + fn prime(&self) -> ValidPrime { + self._prime() + } + + fn len(&self) -> usize { + self._len().logical() + } + + fn is_empty(&self) -> bool { + self._is_empty() + } + + fn entry(&self, index: usize) -> u32 { + self._entry(index) + } + + fn slice<'a>(&self, start: usize, end: usize) -> SliceP<'a, P> + where + Self: 'a, + { + self._slice(LimbLength::from_start_end(start, end)) + } + + fn as_slice(&self) -> SliceP

{ + self._as_slice() + } + + fn is_zero(&self) -> bool { + self._is_zero() + } + + fn iter(&self) -> FpVectorIterator { + self._iter() + } + + fn iter_nonzero(&self) -> FpVectorNonZeroIteratorP

{ + self._iter_nonzero() + } + + fn first_nonzero(&self) -> Option<(usize, u32)> { + self._first_nonzero() + } + + fn sign_rule>(&self, other: S) -> bool { + self._sign_rule(other) + } + + fn into_owned(self) -> FpVectorP

{ + self._into_owned() + } + + fn density(&self) -> f32 { + self._density() + } +} + +impl, const P: u32> BaseVectorMutP

for T { + fn scale(&mut self, c: u32) { + self._scale(c) + } + + fn set_to_zero(&mut self) { + self._set_to_zero() + } + + fn set_entry(&mut self, index: usize, value: u32) { + self._set_entry(index, value) + } + + fn assign>(&mut self, other: S) { + self._assign(other) + } + + fn add>(&mut self, other: S, c: u32) { + self._add(other, c) + } + + fn add_offset>(&mut self, other: S, c: u32, offset: usize) { + self._add_offset(other, c, offset) + } + + fn slice_mut(&mut self, start: usize, end: usize) -> SliceMutP

{ + self._slice_mut(LimbLength::from_start_end(start, end)) + } + + fn as_slice_mut(&mut self) -> SliceMutP

{ + self._as_slice_mut() + } + + fn add_basis_element(&mut self, index: usize, value: u32) { + self._add_basis_element(index, value) + } + + fn copy_from_slice(&mut self, slice: &[u32]) { + self._copy_from_slice(slice) + } + + fn add_masked>(&mut self, other: S, c: u32, mask: &[usize]) { + self._add_masked(other, c, mask) + } + + fn add_unmasked>(&mut self, other: S, c: u32, mask: &[usize]) { + self._add_unmasked(other, c, mask) + } + + fn add_truncate>(&mut self, other: S, c: u32) -> Option<()> { + self._add_truncate(other, c) + } +} diff --git a/ext/crates/fp/src/vector/generic.rs b/ext/crates/fp/src/vector/generic.rs new file mode 100644 index 0000000000..de2386f184 --- /dev/null +++ b/ext/crates/fp/src/vector/generic.rs @@ -0,0 +1,463 @@ +use std::io::{Read, Write}; + +use super::{ + base_generic::{BaseVectorMutP, BaseVectorP}, + internal::{InternalBaseVectorMutP, InternalBaseVectorP}, +}; +use crate::{ + constants, + limb::{self, Limb, LimbLength}, + prime::ValidPrime, +}; + +/// An `FpVectorP` is a vector over $\mathbb{F}_p$ for a fixed prime, implemented using const +/// generics. Due to limitations with const generics, we cannot constrain P to actually be a prime, +/// so we allow it to be any u32. However, most functions will panic if P is not a prime. +/// +/// Interally, it packs entries of the vectors into limbs. However, this is an abstraction that +/// must not leave the `fp` library. +#[derive(Debug, Hash, Eq, PartialEq, Clone)] +pub struct FpVectorP { + /// The limbs containing the entries of the (mathematical) vector. At all times, `limbs` must be + /// at least `len.limbs()` long, but is allowed to be larger. + pub(crate) limbs: Vec, + pub(crate) len: LimbLength

, +} + +/// A SliceP is a slice of an FpVectorP. This immutably borrows the vector and implements Copy. +#[derive(Debug, Copy, Clone)] +pub struct SliceP<'a, const P: u32> { + pub(crate) limbs: &'a [Limb], + pub(crate) range: LimbLength

, +} + +/// A `SliceMutP` is a mutable slice of an `FpVectorP`. This mutably borrows the vector. Since it +/// is a mutable borrow, it cannot implement `Copy`. However, it has a [`SliceMutP::copy`] function +/// that imitates the reborrowing, that mutably borrows `SliceMutP` and returns a `SliceMutP` with +/// a shorter lifetime. +#[derive(Debug)] +pub struct SliceMutP<'a, const P: u32> { + pub(crate) limbs: &'a mut [Limb], + pub(crate) range: LimbLength

, +} + +impl FpVectorP

{ + pub fn new_(len: usize) -> Self { + let length = LimbLength::

::from_logical(len); + Self { + limbs: vec![0; length.limbs()], + len: length, + } + } + + pub fn new_with_capacity_(len: usize, capacity: usize) -> Self { + let length = LimbLength::

::from_logical(len); + let mut limbs = Vec::with_capacity(limb::number::

(capacity)); + limbs.resize(length.limbs(), 0); + Self { limbs, len: length } + } + + // /// A version of [`FpVectorP::assign`] that allows `other` to be shorter than `self`. + pub fn assign_partial(&mut self, other: &Self) { + debug_assert!(other.len() <= self.len()); + self.limbs[0..other.limbs.len()].copy_from_slice(&other.limbs); + for limb in self.limbs[other.limbs.len()..].iter_mut() { + *limb = 0; + } + } + + /// This function ensures the length of the vector is at least `len`. See also + /// `set_scratch_vector_size`. + pub fn extend_len(&mut self, len: usize) { + if self.len() >= len { + return; + } + self.len = LimbLength::

::from_logical(len); + self.limbs.resize(self.len.limbs(), 0); + } + + /// This clears the vector and sets the length to `len`. This is useful for reusing + /// allocations of temporary vectors. + pub fn set_scratch_vector_size(&mut self, len: usize) { + self.len = LimbLength::

::from_logical(len); + self.limbs.clear(); + self.limbs.resize(self.len.limbs(), 0); + } + + /// Permanently remove the first `n` elements in the vector. `n` must be a multiple of + /// the number of entries per limb + pub(crate) fn trim_start(&mut self, n: usize) { + assert!(n <= self.len.logical()); + let entries_per = limb::entries_per_limb_const::

(); + assert_eq!(n % entries_per, 0); + let num_limbs = n / entries_per; + self.limbs.drain(0..num_limbs); + self.len = self.len.trim_start(n); + } + + fn add_carry_limb(&mut self, idx: usize, source: Limb, c: u32, rest: &mut [T]) -> bool + where + for<'a> &'a mut T: TryInto<&'a mut Self>, + { + if P == 2 { + if c == 0 { + return false; + } + let mut cur_vec = self; + let mut carry = source; + for carry_vec in rest.iter_mut() { + let carry_vec = carry_vec + .try_into() + .ok() + .expect("rest vectors in add_carry must be of the same prime"); + let rem = cur_vec.limbs[idx] ^ carry; + let quot = cur_vec.limbs[idx] & carry; + cur_vec.limbs[idx] = rem; + carry = quot; + cur_vec = carry_vec; + if quot == 0 { + return false; + } + } + cur_vec.limbs[idx] ^= carry; + true + } else { + unimplemented!() + } + } + + pub fn add_carry(&mut self, other: &Self, c: u32, rest: &mut [T]) -> bool + where + for<'a> &'a mut T: TryInto<&'a mut Self>, + { + let mut result = false; + for i in 0..self.limbs.len() { + result |= self.add_carry_limb(i, other.limbs[i], c, rest); + } + result + } + + pub fn update_from_bytes(&mut self, data: &mut impl Read) -> std::io::Result<()> { + let limbs = &mut self.limbs; + let num_limbs = limbs.len(); + + if cfg!(target_endian = "little") { + let num_bytes = num_limbs * constants::BYTES_PER_LIMB; + unsafe { + let buf: &mut [u8] = + std::slice::from_raw_parts_mut(limbs.as_mut_ptr() as *mut u8, num_bytes); + data.read_exact(buf).unwrap(); + } + } else { + for entry in limbs { + let mut bytes: [u8; constants::BYTES_PER_LIMB] = [0; constants::BYTES_PER_LIMB]; + data.read_exact(&mut bytes)?; + *entry = Limb::from_le_bytes(bytes); + } + }; + Ok(()) + } + + pub fn from_bytes(_p: ValidPrime, len: usize, data: &mut impl Read) -> std::io::Result { + let mut v = Self::new_(len); + v.update_from_bytes(data)?; + Ok(v) + } + + pub fn to_bytes(&self, buffer: &mut impl Write) -> std::io::Result<()> { + // self.limbs is allowed to have more limbs than necessary, but we 
only save the + // necessary ones. + let num_limbs = limb::number::

(self.len()); + + if cfg!(target_endian = "little") { + let num_bytes = num_limbs * constants::BYTES_PER_LIMB; + unsafe { + let buf: &[u8] = + std::slice::from_raw_parts_mut(self.limbs.as_ptr() as *mut u8, num_bytes); + buffer.write_all(buf)?; + } + } else { + for limb in &self.limbs[0..num_limbs] { + let bytes = limb.to_le_bytes(); + buffer.write_all(&bytes)?; + } + } + Ok(()) + } + + pub(crate) fn limbs(&self) -> &[Limb] { + self._limbs() + } + + pub(crate) fn limbs_mut(&mut self) -> &mut [Limb] { + self._limbs_mut() + } +} + +impl<'a, const P: u32> From<&'a FpVectorP

> for SliceP<'a, P> { + fn from(v: &'a FpVectorP

) -> Self { + v.as_slice() + } +} + +impl<'a, const P: u32> From<&'a mut FpVectorP

> for SliceP<'a, P> { + fn from(v: &'a mut FpVectorP

) -> Self { + (v as &'a FpVectorP

).as_slice() + } +} + +impl<'a, const P: u32> From<&'a mut FpVectorP

> for SliceMutP<'a, P> { + fn from(v: &'a mut FpVectorP

) -> Self { + v.as_slice_mut() + } +} + +impl<'a, const P: u32> SliceMutP<'a, P> { + /// `coeff` need not be reduced mod p. + /// Adds v otimes w to self. + pub fn add_tensor(&mut self, offset: usize, coeff: u32, left: SliceP

, right: SliceP

) { + let right_dim = right.len(); + + for (i, v) in left.iter_nonzero() { + let entry = (v * coeff) % *self.prime(); + self.slice_mut(offset + i * right_dim, offset + (i + 1) * right_dim) + .add(right, entry); + } + } + + /// Generates a version of itself with a shorter lifetime + #[inline] + pub fn copy(&mut self) -> SliceMutP<'_, P> { + SliceMutP { + limbs: self.limbs, + range: self.range, + } + } +} + +impl<'a, 'b, const P: u32> From<&'a mut SliceMutP<'b, P>> for SliceMutP<'a, P> { + fn from(slice: &'a mut SliceMutP<'b, P>) -> SliceMutP<'a, P> { + slice.copy() + } +} + +impl<'a, 'b, const P: u32> From<&'a SliceP<'b, P>> for SliceP<'a, P> { + fn from(slice: &'a SliceP<'b, P>) -> SliceP<'a, P> { + *slice + } +} + +impl<'a, 'b, const P: u32> From<&'a SliceMutP<'b, P>> for SliceP<'a, P> { + fn from(slice: &'a SliceMutP<'b, P>) -> SliceP<'a, P> { + slice.as_slice() + } +} + +impl, const P: u32> From<&T> for FpVectorP

{ + fn from(slice: &T) -> Self { + let mut v = Self::new_(slice.as_ref().len()); + v.limbs.clear(); + v.limbs.extend( + slice + .as_ref() + .chunks(limb::entries_per_limb_const::

()) + .map(|x| limb::pack::<_, P>(x.iter().copied())), + ); + v + } +} + +impl From<&FpVectorP

> for Vec { + fn from(vec: &FpVectorP

) -> Vec { + vec.iter().collect() + } +} + +// Iterators + +pub struct FpVectorIterator<'a> { + limbs: &'a [Limb], + bit_length: usize, + bit_mask: Limb, + entries_per_limb_m_1: usize, + limb_index: usize, + entries_left: usize, + cur_limb: Limb, + counter: usize, +} + +impl<'a> FpVectorIterator<'a> { + pub(crate) fn new + 'a, const P: u32>(vec: &'a T) -> Self { + let counter = vec._len().logical(); + let limbs = vec._limbs(); + + if counter == 0 { + return Self { + limbs, + bit_length: 0, + entries_per_limb_m_1: 0, + bit_mask: 0, + limb_index: 0, + entries_left: 0, + cur_limb: 0, + counter, + }; + } + let pair = limb::limb_bit_index_pair::

(vec._len().start); + + let bit_length = limb::bit_length_const::

(); + let cur_limb = limbs[pair.limb] >> pair.bit_index; + + let entries_per_limb = limb::entries_per_limb_const::

(); + Self { + limbs, + bit_length, + entries_per_limb_m_1: entries_per_limb - 1, + bit_mask: limb::bitmask::

(), + limb_index: pair.limb, + entries_left: entries_per_limb - (vec._len().start % entries_per_limb), + cur_limb, + counter, + } + } + + pub fn skip_n(&mut self, mut n: usize) { + if n >= self.counter { + self.counter = 0; + return; + } + let entries_per_limb = self.entries_per_limb_m_1 + 1; + if n < self.entries_left { + self.entries_left -= n; + self.counter -= n; + self.cur_limb >>= self.bit_length * n; + return; + } + + n -= self.entries_left; + self.counter -= self.entries_left; + self.entries_left = 0; + + let skip_limbs = n / entries_per_limb; + self.limb_index += skip_limbs; + self.counter -= skip_limbs * entries_per_limb; + n -= skip_limbs * entries_per_limb; + + if n > 0 { + self.entries_left = entries_per_limb - n; + self.limb_index += 1; + self.cur_limb = self.limbs[self.limb_index] >> (n * self.bit_length); + self.counter -= n; + } + } +} + +impl<'a> Iterator for FpVectorIterator<'a> { + type Item = u32; + + fn next(&mut self) -> Option { + if self.counter == 0 { + return None; + } else if self.entries_left == 0 { + self.limb_index += 1; + self.cur_limb = self.limbs[self.limb_index]; + self.entries_left = self.entries_per_limb_m_1; + } else { + self.entries_left -= 1; + } + + let result = (self.cur_limb & self.bit_mask) as u32; + self.counter -= 1; + self.cur_limb >>= self.bit_length; + + Some(result) + } +} + +impl<'a> ExactSizeIterator for FpVectorIterator<'a> { + fn len(&self) -> usize { + self.counter + } +} + +/// Iterator over non-zero entries of an FpVector. This is monomorphized over P for significant +/// performance gains. +pub struct FpVectorNonZeroIteratorP<'a, const P: u32> { + limbs: &'a [Limb], + limb_index: usize, + cur_limb_entries_left: usize, + cur_limb: Limb, + idx: usize, + dim: usize, +} + +impl<'a, const P: u32> FpVectorNonZeroIteratorP<'a, P> { + pub(crate) fn new + 'a>(vec: &'a T) -> Self { + let entries_per_limb = limb::entries_per_limb_const::

(); + + let dim = vec._len().logical(); + let limbs = vec._limbs(); + + if dim == 0 { + return Self { + limbs, + limb_index: 0, + cur_limb_entries_left: 0, + cur_limb: 0, + idx: 0, + dim: 0, + }; + } + let min_index = vec._len().start; + let pair = limb::limb_bit_index_pair::

(vec._len().start); + let cur_limb = limbs[pair.limb] >> pair.bit_index; + let cur_limb_entries_left = entries_per_limb - (min_index % entries_per_limb); + Self { + limbs, + limb_index: pair.limb, + cur_limb_entries_left, + cur_limb, + idx: 0, + dim, + } + } +} + +impl<'a, const P: u32> Iterator for FpVectorNonZeroIteratorP<'a, P> { + type Item = (usize, u32); + + fn next(&mut self) -> Option { + let bit_length: usize = limb::bit_length_const::

(); + let bitmask: Limb = limb::bitmask::

(); + let entries_per_limb: usize = limb::entries_per_limb_const::

(); + loop { + let bits_left = (self.cur_limb_entries_left * bit_length) as u32; + #[allow(clippy::unnecessary_cast)] + let tz_real = (self.cur_limb | (1 as Limb).checked_shl(bits_left as u32).unwrap_or(0)) + .trailing_zeros(); + let tz_rem = ((tz_real as u8) % (bit_length as u8)) as u32; + let tz_div = ((tz_real as u8) / (bit_length as u8)) as u32; + let tz = tz_real - tz_rem; + self.idx += tz_div as usize; + if self.idx >= self.dim { + return None; + } + self.cur_limb_entries_left -= tz_div as usize; + if self.cur_limb_entries_left == 0 { + self.limb_index += 1; + self.cur_limb_entries_left = entries_per_limb; + self.cur_limb = self.limbs[self.limb_index]; + continue; + } + self.cur_limb >>= tz; + if tz == 0 { + break; + } + } + let result = (self.idx, (self.cur_limb & bitmask) as u32); + self.idx += 1; + self.cur_limb_entries_left -= 1; + self.cur_limb >>= bit_length; + Some(result) + } +} diff --git a/ext/crates/fp/src/vector/internal/impl_internal.rs b/ext/crates/fp/src/vector/internal/impl_internal.rs new file mode 100644 index 0000000000..962d17b3f5 --- /dev/null +++ b/ext/crates/fp/src/vector/internal/impl_internal.rs @@ -0,0 +1,232 @@ +use itertools::Itertools; + +use super::{InternalBaseVectorMutP, InternalBaseVectorP}; +use crate::{ + limb::{self, Limb, LimbLength}, + simd, + vector::generic::{FpVectorP, SliceMutP, SliceP}, +}; + +impl InternalBaseVectorP

for FpVectorP

{ + fn _as_ptr(&self) -> *const Limb { + self.limbs.as_ptr() + } + + fn _len(&self) -> LimbLength

{ + self.len + } + + fn _is_zero(&self) -> bool { + self.limbs.iter().all(|&x| x == 0) + } + + fn _limbs(&self) -> &[Limb] { + &self.limbs + } + + fn _into_owned(self) -> FpVectorP

{ + self + } + + fn _first_nonzero(&self) -> Option<(usize, u32)> { + let entries_per_limb = limb::entries_per_limb_const::

(); + let bit_length = limb::bit_length_const::

(); + let bitmask = limb::bitmask::

(); + for (i, &limb) in self._limbs().iter().enumerate() { + if limb == 0 { + continue; + } + let index = limb.trailing_zeros() as usize / bit_length; + return Some(( + i * entries_per_limb + index, + ((limb >> (index * bit_length)) & bitmask) as u32, + )); + } + None + } + + fn _density(&self) -> f32 { + (if P == 2 { + self.limbs + .iter() + .copied() + .map(Limb::count_ones) + .sum::() as usize + } else { + self._iter_nonzero().count() + }) as f32 + / self._len().logical() as f32 + } +} + +impl InternalBaseVectorMutP

for FpVectorP

{ + fn _as_mut_ptr(&mut self) -> *mut Limb { + self.limbs.as_mut_ptr() + } + + fn _set_to_zero(&mut self) { + self.limbs.fill(0); + } + + fn _scale(&mut self, c: u32) { + match P { + 2 => { + if c == 0 { + self._set_to_zero() + } + } + 3 | 5 => { + for limb in &mut self.limbs { + *limb = limb::reduce::

(*limb * c as Limb); + } + } + _ => { + for limb in &mut self.limbs { + *limb = limb::pack::<_, P>(limb::unpack::

(*limb).map(|x| (x * c) % P)); + } + } + } + } + + fn _add_offset>(&mut self, other: T, c: u32, offset: usize) { + debug_assert_eq!( + other._len().start, + 0, + "`FpVector::add_offset` only supports limb-aligned arguments" + ); + debug_assert_eq!(self._len().logical(), other._len().logical()); + let min_limb = offset / limb::entries_per_limb_const::

(); + if P == 2 { + if c != 0 { + simd::add_simd(&mut self.limbs, other._limbs(), min_limb); + } + } else { + for (left, right) in self.limbs.iter_mut().zip(other._limbs()).skip(min_limb) { + *left = limb::add::

(*left, *right, c); + } + for limb in &mut self.limbs[min_limb..] { + *limb = limb::reduce::

(*limb); + } + } + } + + fn _assign>(&mut self, other: T) { + debug_assert_eq!(self._len().logical(), other._len().logical()); + let other_num_limbs = other._len().limbs(); + let shift = other._len().bit_offset(); + + self.limbs.resize(other_num_limbs, 0); + self.limbs + .copy_from_slice(&other._limbs()[..other_num_limbs]); + + if shift > 0 { + let mut borrow = 0; + let borrow_shift = limb::used_bits::

() - shift; + for elem in self.limbs.iter_mut().rev() { + let new_borrow = *elem << borrow_shift; + *elem = ((*elem >> shift) | borrow) & limb::used_mask::

(); + borrow = new_borrow; + } + } + + // Potentially useless, but otherwise we can end up with nonzero limbs past the end of the + // vector. That doesn't seem to cause a problem for now but it might down the road. + self.limbs.truncate(self._len().limbs()); + + for limb in self.limbs.iter() { + debug_assert_eq!(limb & !limb::used_mask::

(), 0); + } + } + + fn _limbs_mut(&mut self) -> &mut [Limb] { + &mut self.limbs + } + + fn _copy_from_slice(&mut self, slice: &[u32]) { + assert_eq!(self._len().logical(), slice.len()); + + self.limbs.clear(); + self.limbs.extend( + slice + .chunks(limb::entries_per_limb_const::

()) + .map(|x| limb::pack::<_, P>(x.iter().copied())), + ); + } + + fn _add_truncate>(&mut self, other: T, c: u32) -> Option<()> { + // We require `other` to start on a limb boundary. In practice we only ever call this + // function with `other: FpVectorP`, which satisfies this condition by definition. + debug_assert_eq!(other._len().start, 0); + for (left, right) in self.limbs.iter_mut().zip_eq(other._limbs()) { + *left = limb::add::

(*left, *right, c); + *left = limb::truncate::

(*left)?; + } + Some(()) + } +} + +impl<'a, const P: u32> InternalBaseVectorP

for SliceP<'a, P> { + fn _as_ptr(&self) -> *const Limb { + self.limbs.as_ptr() + } + + fn _len(&self) -> LimbLength

{ + self.range + } + + fn _limbs(&self) -> &[Limb] { + self.limbs + } +} + +impl<'a, const P: u32> InternalBaseVectorP

for SliceMutP<'a, P> { + fn _as_ptr(&self) -> *const Limb { + self.limbs.as_ptr() + } + + fn _len(&self) -> LimbLength

{ + self.range + } + + fn _limbs(&self) -> &[Limb] { + self.limbs + } +} + +impl<'a, const P: u32> InternalBaseVectorMutP

for SliceMutP<'a, P> { + fn _as_mut_ptr(&mut self) -> *mut Limb { + self.limbs.as_mut_ptr() + } + + fn _limbs_mut(&mut self) -> &mut [Limb] { + self.limbs + } +} + +// Tautological impls + +impl, const P: u32> InternalBaseVectorP

for &T { + fn _as_ptr(&self) -> *const Limb { + T::_as_ptr(self) + } + + fn _len(&self) -> LimbLength

{ + T::_len(self) + } +} + +impl, const P: u32> InternalBaseVectorP

for &mut T { + fn _as_ptr(&self) -> *const Limb { + T::_as_ptr(self) + } + + fn _len(&self) -> LimbLength

{ + T::_len(self) + } +} + +impl, const P: u32> InternalBaseVectorMutP

for &mut T { + fn _as_mut_ptr(&mut self) -> *mut Limb { + T::_as_mut_ptr(self) + } +} diff --git a/ext/crates/fp/src/vector/internal/mod.rs b/ext/crates/fp/src/vector/internal/mod.rs new file mode 100644 index 0000000000..330e751e49 --- /dev/null +++ b/ext/crates/fp/src/vector/internal/mod.rs @@ -0,0 +1,686 @@ +use std::cmp::Ordering; + +use super::generic::{FpVectorIterator, FpVectorNonZeroIteratorP, FpVectorP, SliceMutP, SliceP}; +use crate::{ + constants, + limb::{self, Limb, LimbLength}, + prime::ValidPrime, +}; + +mod impl_internal; + +pub trait InternalBaseVectorP: Sized { + /// Returns a pointer to the allocation containing the actual data. This is a raw pointer and + /// does not take lifetimes into account. It is the responsibility of the caller to ensure that + /// the pointer is not dereferenced after the allocation is freed. + /// + /// We use a pointer instead of a slice because otherwise handling the lifetimes is a huge mess. + /// In practice it is almost always better to use [`InternalBaseVectorP::_limbs`] to manipulate + /// the underlying data. + fn _as_ptr(&self) -> *const Limb; + + /// Returns a description of the vector as a [`LimbLength`]. See there for the available + /// information. + fn _len(&self) -> LimbLength

; + + fn _limbs(&self) -> &[Limb] { + unsafe { std::slice::from_raw_parts(self._as_ptr(), self._len().limbs()) } + } + + fn _prime(&self) -> ValidPrime { + ValidPrime::new(P) + } + + fn _is_empty(&self) -> bool { + self._len().logical() == 0 + } + + fn _is_zero(&self) -> bool { + let limb_range = self._len().limb_range(); + if limb_range.is_empty() { + return true; + } + let (min_mask, max_mask) = self._len().limb_masks(); + if self._limbs()[limb_range.start] & min_mask != 0 { + return false; + } + + let inner_range = self._len().limb_range_inner(); + if self._limbs()[inner_range].iter().any(|&x| x != 0) { + return false; + } + if self._limbs()[limb_range.end - 1] & max_mask != 0 { + return false; + } + true + } + + fn _slice<'a>(&self, range: LimbLength

) -> SliceP<'a, P> + where + Self: 'a, + { + let (new_len, offset) = self._len().restrict_to(range).apply_shift(); + let limbs_ptr = unsafe { self._as_ptr().add(offset) }; + let limbs = unsafe { std::slice::from_raw_parts(limbs_ptr, new_len.limbs()) }; + SliceP { + limbs, + range: new_len, + } + } + + fn _as_slice(&self) -> SliceP

{ + SliceP { + limbs: self._limbs(), + range: self._len(), + } + } + + fn _entry(&self, index: usize) -> u32 { + debug_assert!( + index < self._len().logical(), + "Index {} too large, length of vector is only {}.", + index, + self._len().logical() + ); + let bit_mask = limb::bitmask::

(); + let limb_index = limb::limb_bit_index_pair::

(index + self._len().start); + let mut result = self._limbs()[limb_index.limb]; + result >>= limb_index.bit_index; + result &= bit_mask; + result as u32 + } + + fn _iter(&self) -> FpVectorIterator { + FpVectorIterator::new(self) + } + + fn _iter_nonzero(&self) -> FpVectorNonZeroIteratorP

{ + FpVectorNonZeroIteratorP::new(self) + } + + fn _first_nonzero(&self) -> Option<(usize, u32)> { + todo!(); + } + + fn _sign_rule>(&self, other: S) -> bool { + assert_eq!(P, 2); + let mut result = 0; + for target_limb_idx in 0..self._limbs().len() { + let target_limb = other._limbs()[target_limb_idx]; + let source_limb = self._limbs()[target_limb_idx]; + result ^= limb::sign_rule(target_limb, source_limb); + if target_limb.count_ones() % 2 == 0 { + continue; + } + for _ in 0..target_limb_idx { + result ^= source_limb.count_ones() % 2; + } + } + result == 1 + } + + fn _into_owned(self) -> FpVectorP

{ + let mut new = FpVectorP::

::new_(self._len().logical()); + if self._len().start % limb::entries_per_limb_const::

() == 0 { + let limb_range = self._len().limb_range(); + new.limbs[0..limb_range.len()].copy_from_slice(&self._limbs()[limb_range]); + if !new.limbs.is_empty() { + let len = new.limbs.len(); + new.limbs[len - 1] &= self._len().limb_masks().1; + } + } else { + new._assign(self); + } + new + } + + fn _density(&self) -> f32 { + self._iter_nonzero().count() as f32 / self._len().logical() as f32 + } +} + +pub trait InternalBaseVectorMutP: InternalBaseVectorP

{ + fn _as_mut_ptr(&mut self) -> *mut Limb; + + fn _limbs_mut(&mut self) -> &mut [Limb] { + unsafe { std::slice::from_raw_parts_mut(self._as_mut_ptr(), self._len().limbs()) } + } + + fn _slice_mut(&mut self, range: LimbLength

) -> SliceMutP

{ + let (new_len, offset) = self._len().restrict_to(range).apply_shift(); + let limbs_ptr = unsafe { self._as_mut_ptr().add(offset) }; + let limbs = unsafe { std::slice::from_raw_parts_mut(limbs_ptr, new_len.limbs()) }; + SliceMutP { + limbs, + range: new_len, + } + } + + fn _as_slice_mut(&mut self) -> SliceMutP

{ + let range = self._len(); + SliceMutP { + limbs: self._limbs_mut(), + range, + } + } + + fn _add>(&mut self, other: T, c: u32) { + debug_assert!(c < P); + if self._is_empty() { + return; + } + + if P == 2 { + if c != 0 { + match self._len().bit_offset().cmp(&other._len().bit_offset()) { + Ordering::Equal => self._add_shift_none(other, 1), + Ordering::Less => self._add_shift_left(other, 1), + Ordering::Greater => self._add_shift_right(other, 1), + }; + } + } else { + match self._len().bit_offset().cmp(&other._len().bit_offset()) { + Ordering::Equal => self._add_shift_none(other, c), + Ordering::Less => self._add_shift_left(other, c), + Ordering::Greater => self._add_shift_right(other, c), + }; + } + } + + fn _add_shift_none>(&mut self, other: T, c: u32) { + let target_range = self._len().limb_range(); + let source_range = other._len().limb_range(); + + let (min_mask, max_mask) = other._len().limb_masks(); + let other_limbs = other._limbs(); + + self._limbs_mut()[target_range.start] = limb::add::

( + self._limbs_mut()[target_range.start], + other_limbs[source_range.start] & min_mask, + c, + ); + self._limbs_mut()[target_range.start] = + limb::reduce::

(self._limbs_mut()[target_range.start]); + + let target_inner_range = self._len().limb_range_inner(); + let source_inner_range = other._len().limb_range_inner(); + if !source_inner_range.is_empty() { + limb::add_all::

( + &mut self._limbs_mut()[target_inner_range], + &other_limbs[source_inner_range], + c, + ); + } + if source_range.len() > 1 { + // The first and last limbs are distinct, so we process the last. + self._limbs_mut()[target_range.end - 1] = limb::add::

( + self._limbs_mut()[target_range.end - 1], + other_limbs[source_range.end - 1] & max_mask, + c, + ); + self._limbs_mut()[target_range.end - 1] = + limb::reduce::

(self._limbs_mut()[target_range.end - 1]); + } + } + + fn _add_shift_left>(&mut self, other: T, c: u32) { + struct AddShiftLeftData { + offset_shift: usize, + tail_shift: usize, + zero_bits: usize, + min_source_limb: usize, + min_target_limb: usize, + number_of_source_limbs: usize, + number_of_target_limbs: usize, + min_mask: Limb, + max_mask: Limb, + } + + impl AddShiftLeftData { + fn new(target: T, source: S) -> Self + where + T: InternalBaseVectorP

, + S: InternalBaseVectorP

, + { + debug_assert!(target._prime() == source._prime()); + debug_assert!(target._len().bit_offset() <= source._len().bit_offset()); + debug_assert!( + target._len().logical() == source._len().logical(), + "self.dim {} not equal to other.dim {}", + target._len().logical(), + source._len().logical() + ); + let offset_shift = source._len().bit_offset() - target._len().bit_offset(); + let bit_length = limb::bit_length_const::

(); + let entries_per_limb = limb::entries_per_limb_const::

(); + let usable_bits_per_limb = bit_length * entries_per_limb; + let tail_shift = usable_bits_per_limb - offset_shift; + let zero_bits = constants::BITS_PER_LIMB - usable_bits_per_limb; + let source_range = source._len().limb_range(); + let target_range = target._len().limb_range(); + let min_source_limb = source_range.start; + let min_target_limb = target_range.start; + let number_of_source_limbs = source_range.len(); + let number_of_target_limbs = target_range.len(); + let (min_mask, max_mask) = source._len().limb_masks(); + + Self { + offset_shift, + tail_shift, + zero_bits, + min_source_limb, + min_target_limb, + number_of_source_limbs, + number_of_target_limbs, + min_mask, + max_mask, + } + } + + fn mask_first_limb, const P: u32>( + &self, + other: T, + i: usize, + ) -> Limb { + (other._limbs()[i] & self.min_mask) >> self.offset_shift + } + + fn mask_middle_limb_a, const P: u32>( + &self, + other: T, + i: usize, + ) -> Limb { + other._limbs()[i] >> self.offset_shift + } + + fn mask_middle_limb_b, const P: u32>( + &self, + other: T, + i: usize, + ) -> Limb { + (other._limbs()[i] << (self.tail_shift + self.zero_bits)) >> self.zero_bits + } + + fn mask_last_limb_a, const P: u32>( + &self, + other: T, + i: usize, + ) -> Limb { + let source_limb_masked = other._limbs()[i] & self.max_mask; + source_limb_masked << self.tail_shift + } + + fn mask_last_limb_b, const P: u32>( + &self, + other: T, + i: usize, + ) -> Limb { + let source_limb_masked = other._limbs()[i] & self.max_mask; + source_limb_masked >> self.offset_shift + } + } + + let dat = AddShiftLeftData::new(&self, &other); + let mut i = 0; + let limbs_mut = self._limbs_mut(); + + { + limbs_mut[i + dat.min_target_limb] = limb::add::

( + limbs_mut[i + dat.min_target_limb], + dat.mask_first_limb(&other, i + dat.min_source_limb), + c, + ); + } + for i in 1..dat.number_of_source_limbs - 1 { + limbs_mut[i + dat.min_target_limb] = limb::add::

( + limbs_mut[i + dat.min_target_limb], + dat.mask_middle_limb_a(&other, i + dat.min_source_limb), + c, + ); + limbs_mut[i + dat.min_target_limb - 1] = limb::add::

( + limbs_mut[i + dat.min_target_limb - 1], + dat.mask_middle_limb_b(&other, i + dat.min_source_limb), + c, + ); + limbs_mut[i + dat.min_target_limb - 1] = + limb::reduce::

(limbs_mut[i + dat.min_target_limb - 1]); + } + i = dat.number_of_source_limbs - 1; + if i > 0 { + limbs_mut[i + dat.min_target_limb - 1] = limb::add::

( + limbs_mut[i + dat.min_target_limb - 1], + dat.mask_last_limb_a(&other, i + dat.min_source_limb), + c, + ); + limbs_mut[i + dat.min_target_limb - 1] = + limb::reduce::

(limbs_mut[i + dat.min_target_limb - 1]); + if dat.number_of_source_limbs == dat.number_of_target_limbs { + limbs_mut[i + dat.min_target_limb] = limb::add::

( + limbs_mut[i + dat.min_target_limb], + dat.mask_last_limb_b(&other, i + dat.min_source_limb), + c, + ); + limbs_mut[i + dat.min_target_limb] = + limb::reduce::

(limbs_mut[i + dat.min_target_limb]); + } + } else { + limbs_mut[i + dat.min_target_limb] = + limb::reduce::

(limbs_mut[i + dat.min_target_limb]); + } + } + + fn _add_shift_right>(&mut self, other: T, c: u32) { + struct AddShiftRightData { + offset_shift: usize, + tail_shift: usize, + zero_bits: usize, + min_source_limb: usize, + min_target_limb: usize, + number_of_source_limbs: usize, + number_of_target_limbs: usize, + min_mask: Limb, + max_mask: Limb, + } + + impl AddShiftRightData { + fn new(target: T, source: S) -> Self + where + T: InternalBaseVectorP

, + S: InternalBaseVectorP

, + { + debug_assert!(target._prime() == source._prime()); + debug_assert!(target._len().bit_offset() >= source._len().bit_offset()); + debug_assert!( + target._len().logical() == source._len().logical(), + "self.dim {} not equal to other.dim {}", + target._len().logical(), + source._len().logical() + ); + let offset_shift = target._len().bit_offset() - source._len().bit_offset(); + let bit_length = limb::bit_length_const::

(); + let entries_per_limb = limb::entries_per_limb_const::

(); + let usable_bits_per_limb = bit_length * entries_per_limb; + let tail_shift = usable_bits_per_limb - offset_shift; + let zero_bits = constants::BITS_PER_LIMB - usable_bits_per_limb; + let source_range = source._len().limb_range(); + let target_range = target._len().limb_range(); + let min_source_limb = source_range.start; + let min_target_limb = target_range.start; + let number_of_source_limbs = source_range.len(); + let number_of_target_limbs = target_range.len(); + let (min_mask, max_mask) = source._len().limb_masks(); + Self { + offset_shift, + tail_shift, + zero_bits, + min_source_limb, + min_target_limb, + number_of_source_limbs, + number_of_target_limbs, + min_mask, + max_mask, + } + } + + fn mask_first_limb_a, const P: u32>( + &self, + other: T, + i: usize, + ) -> Limb { + let source_limb_masked = other._limbs()[i] & self.min_mask; + (source_limb_masked << (self.offset_shift + self.zero_bits)) >> self.zero_bits + } + + fn mask_first_limb_b, const P: u32>( + &self, + other: T, + i: usize, + ) -> Limb { + let source_limb_masked = other._limbs()[i] & self.min_mask; + source_limb_masked >> self.tail_shift + } + + fn mask_middle_limb_a, const P: u32>( + &self, + other: T, + i: usize, + ) -> Limb { + (other._limbs()[i] << (self.offset_shift + self.zero_bits)) >> self.zero_bits + } + + fn mask_middle_limb_b, const P: u32>( + &self, + other: T, + i: usize, + ) -> Limb { + other._limbs()[i] >> self.tail_shift + } + + fn mask_last_limb_a, const P: u32>( + &self, + other: T, + i: usize, + ) -> Limb { + let source_limb_masked = other._limbs()[i] & self.max_mask; + source_limb_masked << self.offset_shift + } + + fn mask_last_limb_b, const P: u32>( + &self, + other: T, + i: usize, + ) -> Limb { + let source_limb_masked = other._limbs()[i] & self.max_mask; + source_limb_masked >> self.tail_shift + } + } + + let dat = AddShiftRightData::new(&self, &other); + let mut i = 0; + let limbs_mut = self._limbs_mut(); + + { + limbs_mut[i + dat.min_target_limb] = limb::add::

( + limbs_mut[i + dat.min_target_limb], + dat.mask_first_limb_a(&other, i + dat.min_source_limb), + c, + ); + limbs_mut[i + dat.min_target_limb] = + limb::reduce::

(limbs_mut[i + dat.min_target_limb]); + if dat.number_of_target_limbs > 1 { + limbs_mut[i + dat.min_target_limb + 1] = limb::add::

( + limbs_mut[i + dat.min_target_limb + 1], + dat.mask_first_limb_b(&other, i + dat.min_source_limb), + c, + ); + } + } + for i in 1..dat.number_of_source_limbs - 1 { + limbs_mut[i + dat.min_target_limb] = limb::add::

( + limbs_mut[i + dat.min_target_limb], + dat.mask_middle_limb_a(&other, i + dat.min_source_limb), + c, + ); + limbs_mut[i + dat.min_target_limb] = + limb::reduce::

(limbs_mut[i + dat.min_target_limb]); + limbs_mut[i + dat.min_target_limb + 1] = limb::add::

( + limbs_mut[i + dat.min_target_limb + 1], + dat.mask_middle_limb_b(&other, i + dat.min_source_limb), + c, + ); + } + i = dat.number_of_source_limbs - 1; + if i > 0 { + limbs_mut[i + dat.min_target_limb] = limb::add::

( + limbs_mut[i + dat.min_target_limb], + dat.mask_last_limb_a(&other, i + dat.min_source_limb), + c, + ); + limbs_mut[i + dat.min_target_limb] = + limb::reduce::

(limbs_mut[i + dat.min_target_limb]); + if dat.number_of_target_limbs > dat.number_of_source_limbs { + limbs_mut[i + dat.min_target_limb + 1] = limb::add::

( + limbs_mut[i + dat.min_target_limb + 1], + dat.mask_last_limb_b(&other, i + dat.min_source_limb), + c, + ); + } + } + if dat.number_of_target_limbs > dat.number_of_source_limbs { + limbs_mut[i + dat.min_target_limb + 1] = + limb::reduce::

(limbs_mut[i + dat.min_target_limb + 1]); + } + } + + /// Given a mask v, add the `v[i]`th entry of `other` to the `i`th entry of `self`. + fn _add_masked>(&mut self, other: T, c: u32, mask: &[usize]) { + // TODO: If this ends up being a bottleneck, try to use PDEP/PEXT + assert_eq!(self._as_slice()._len().logical(), mask.len()); + for (i, &x) in mask.iter().enumerate() { + let entry = other._entry(x); + if entry != 0 { + self._add_basis_element(i, entry * c); + } + } + } + + /// Given a mask v, add the `i`th entry of `other` to the `v[i]`th entry of `self`. + fn _add_unmasked>(&mut self, other: T, c: u32, mask: &[usize]) { + assert!(other._len().logical() <= mask.len()); + for (i, v) in other._iter_nonzero() { + self._add_basis_element(mask[i], v * c); + } + } + + fn _add_basis_element(&mut self, index: usize, value: u32) { + if P == 2 { + // Checking for value % 2 == 0 appears to be less performant + let pair = limb::limb_bit_index_pair::<2>(index + self._len().start); + self._limbs_mut()[pair.limb] ^= (value as Limb % 2) << pair.bit_index; + } else { + let mut x = self._entry(index); + x += value; + x %= P; + self._set_entry(index, x); + } + } + + fn _add_offset>(&mut self, _other: T, _c: u32, _offset: usize) { + todo!(); + } + + fn _set_entry(&mut self, index: usize, value: u32) { + debug_assert!(index < self._len().logical()); + let bit_mask = limb::bitmask::

(); + let limb_index = limb::limb_bit_index_pair::

(index + self._len().start); + let mut result = self._limbs()[limb_index.limb]; + result &= !(bit_mask << limb_index.bit_index); + result |= (value as Limb) << limb_index.bit_index; + self._limbs_mut()[limb_index.limb] = result; + } + + fn _set_to_zero(&mut self) { + let limb_range = self._len().limb_range(); + if limb_range.is_empty() { + return; + } + let (min_mask, max_mask) = self._len().limb_masks(); + self._limbs_mut()[limb_range.start] &= !min_mask; + + let inner_range = self._len().limb_range_inner(); + for limb in &mut self._limbs_mut()[inner_range] { + *limb = 0; + } + self._limbs_mut()[limb_range.end - 1] &= !max_mask; + } + + fn _reduce_limbs(&mut self) { + if P != 2 { + let limb_range = self._len().limb_range(); + + for limb in &mut self._limbs_mut()[limb_range] { + *limb = limb::reduce::

(*limb); + } + } + } + + fn _scale(&mut self, c: u32) { + if P == 2 { + if c == 0 { + self._set_to_zero(); + } + return; + } + + let c = c as Limb; + let limb_range = self._len().limb_range(); + if limb_range.is_empty() { + return; + } + let (min_mask, max_mask) = self._len().limb_masks(); + + let limb = self._limbs()[limb_range.start]; + let masked_limb = limb & min_mask; + let rest_limb = limb & !min_mask; + self._limbs_mut()[limb_range.start] = (masked_limb * c) | rest_limb; + + let inner_range = self._len().limb_range_inner(); + for limb in &mut self._limbs_mut()[inner_range] { + *limb *= c; + } + if limb_range.len() > 1 { + let full_limb = self._limbs()[limb_range.end - 1]; + let masked_limb = full_limb & max_mask; + let rest_limb = full_limb & !max_mask; + self._limbs_mut()[limb_range.end - 1] = (masked_limb * c) | rest_limb; + } + self._reduce_limbs(); + } + + fn _assign>(&mut self, other: T) { + debug_assert_eq!(self._len().logical(), other._len().logical()); + if self._len().bit_offset() != other._len().bit_offset() { + self._set_to_zero(); + self._add(other, 1); + return; + } + let target_range = self._len().limb_range(); + let source_range = other._len().limb_range(); + + if target_range.is_empty() { + return; + } + + let (min_mask, max_mask) = other._len().limb_masks(); + + let result = other._limbs()[source_range.start] & min_mask; + self._limbs_mut()[target_range.start] &= !min_mask; + self._limbs_mut()[target_range.start] |= result; + + let target_inner_range = self._len().limb_range_inner(); + let source_inner_range = other._len().limb_range_inner(); + if !target_inner_range.is_empty() && !source_inner_range.is_empty() { + self._limbs_mut()[target_inner_range] + .clone_from_slice(&other._limbs()[source_inner_range]); + } + + let result = other._limbs()[source_range.end - 1] & max_mask; + self._limbs_mut()[target_range.end - 1] &= !max_mask; + self._limbs_mut()[target_range.end - 1] |= result; + } + + /// This replaces the contents of the vector with 
the contents of the slice. The two must have + /// the same length. + /// + /// This method is only implemented on `FpVectorP` right now. This is the only use case so far, + /// so I don't feel too bad about marking it as unimplemented in the general case. + fn _copy_from_slice(&mut self, _slice: &[u32]) { + unimplemented!(); + } + + /// This method is only implemented on `FpVectorP` right now. This is the only use case so far, + /// so I don't feel too bad about marking it as unimplemented in the general case. + fn _add_truncate>(&mut self, _other: T, _c: u32) -> Option<()> { + unimplemented!(); + } +} diff --git a/ext/crates/fp/src/vector.rs b/ext/crates/fp/src/vector/mod.rs similarity index 64% rename from ext/crates/fp/src/vector.rs rename to ext/crates/fp/src/vector/mod.rs index 540b358ac3..1ecf69a63b 100644 --- a/ext/crates/fp/src/vector.rs +++ b/ext/crates/fp/src/vector/mod.rs @@ -1,491 +1,39 @@ -//! This module is provides wrappers around the contents of [`crate::vector_inner`]. The main -//! purpose is to put [`FpVectorP`] for different `p` into a single enum. It does the same for the -//! various slice structs. -//! -//! The main magic occurs in the macro `dispatch_vector_inner`, which we use to provide wrapper -//! functions around the `FpVectorP` functions. -//! -//! This module is only used when the `odd-primes` feature is enabled. - -use crate::limb::{entries_per_limb, Limb}; -use crate::prime::ValidPrime; -use crate::vector_inner::{ - FpVectorIterator, FpVectorNonZeroIteratorP, FpVectorP, SliceMutP, SliceP, -}; -use itertools::Itertools; -#[cfg(feature = "json")] -use serde::{Deserialize, Deserializer, Serialize, Serializer}; - -use std::convert::TryInto; -use std::io::{Read, Write}; -use std::mem::size_of; - -macro_rules! dispatch_vector_inner { - // other is a type, but marking it as a :ty instead of :tt means we cannot use it to access its - // enum variants. 
- ($vis:vis fn $method:ident(&self, other: &$other:tt $(, $arg:ident: $ty:ty )* ) $(-> $ret:ty)?) => { - $vis fn $method(&self, other: &$other, $($arg: $ty),* ) $(-> $ret)* { - match (self, other) { - (Self::_2(x), $other::_2(y)) => x.$method(y, $($arg),*), - (Self::_3(x), $other::_3(y)) => x.$method(y, $($arg),*), - (Self::_5(x), $other::_5(y)) => x.$method(y, $($arg),*), - (Self::_7(x), $other::_7(y)) => x.$method(y, $($arg),*), - (l, r) => { - panic!("Applying {} to vectors over different primes ({} and {})", stringify!($method), l.prime(), r.prime()); - } - } - } - }; - ($vis:vis fn $method:ident(&mut self, other: &$other:tt $(, $arg:ident: $ty:ty )* ) $(-> $ret:ty)?) => { - #[allow(unused_parens)] - $vis fn $method(&mut self, other: &$other, $($arg: $ty),* ) $(-> $ret)* { - match (self, other) { - (Self::_2(x), $other::_2(y)) => x.$method(y, $($arg),*), - (Self::_3(x), $other::_3(y)) => x.$method(y, $($arg),*), - (Self::_5(x), $other::_5(y)) => x.$method(y, $($arg),*), - (Self::_7(x), $other::_7(y)) => x.$method(y, $($arg),*), - (l, r) => { - panic!("Applying {} to vectors over different primes ({} and {})", stringify!($method), l.prime(), r.prime()); - } - } - } - }; - ($vis:vis fn $method:ident(&mut self, other: $other:tt $(, $arg:ident: $ty:ty )* ) $(-> $ret:ty)?) 
=> { - $vis fn $method(&mut self, other: $other, $($arg: $ty),* ) $(-> $ret)* { - match (self, other) { - (Self::_2(x), $other::_2(y)) => x.$method(y, $($arg),*), - (Self::_3(x), $other::_3(y)) => x.$method(y, $($arg),*), - (Self::_5(x), $other::_5(y)) => x.$method(y, $($arg),*), - (Self::_7(x), $other::_7(y)) => x.$method(y, $($arg),*), - (l, r) => { - panic!("Applying {} to vectors over different primes ({} and {})", stringify!($method), l.prime(), r.prime()); - } - } - } - }; - ($vis:vis fn $method:ident(&mut self $(, $arg:ident: $ty:ty )* ) -> (dispatch $ret:tt)) => { - #[must_use] - $vis fn $method(&mut self, $($arg: $ty),* ) -> $ret { - match self { - Self::_2(x) => $ret::_2(x.$method($($arg),*)), - Self::_3(x) => $ret::_3(x.$method($($arg),*)), - Self::_5(x) => $ret::_5(x.$method($($arg),*)), - Self::_7(x) => $ret::_7(x.$method($($arg),*)), - } - } - }; - ($vis:vis fn $method:ident(&self $(, $arg:ident: $ty:ty )* ) -> (dispatch $ret:tt)) => { - #[must_use] - $vis fn $method(&self, $($arg: $ty),* ) -> $ret { - match self { - Self::_2(x) => $ret::_2(x.$method($($arg),*)), - Self::_3(x) => $ret::_3(x.$method($($arg),*)), - Self::_5(x) => $ret::_5(x.$method($($arg),*)), - Self::_7(x) => $ret::_7(x.$method($($arg),*)), - } - } - }; - ($vis:vis fn $method:ident(self $(, $arg:ident: $ty:ty )* ) -> (dispatch $ret:tt)) => { - #[must_use] - $vis fn $method(self, $($arg: $ty),* ) -> $ret { - match self { - Self::_2(x) => $ret::_2(x.$method($($arg),*)), - Self::_3(x) => $ret::_3(x.$method($($arg),*)), - Self::_5(x) => $ret::_5(x.$method($($arg),*)), - Self::_7(x) => $ret::_7(x.$method($($arg),*)), - } - } - }; - - ($vis:vis fn $method:ident(self $(, $arg:ident: $ty:ty )* ) -> (dispatch $ret:tt $lifetime:tt)) => { - #[must_use] - $vis fn $method(self, $($arg: $ty),* ) -> $ret<$lifetime> { - match self { - Self::_2(x) => $ret::_2(x.$method($($arg),*)), - Self::_3(x) => $ret::_3(x.$method($($arg),*)), - Self::_5(x) => $ret::_5(x.$method($($arg),*)), - Self::_7(x) => 
$ret::_7(x.$method($($arg),*)), - } - } - }; - - ($vis:vis fn $method:ident(&mut self $(, $arg:ident: $ty:ty )* ) $(-> $ret:ty)?) => { - #[allow(unused_parens)] - $vis fn $method(&mut self, $($arg: $ty),* ) $(-> $ret)* { - match self { - Self::_2(x) => x.$method($($arg),*), - Self::_3(x) => x.$method($($arg),*), - Self::_5(x) => x.$method($($arg),*), - Self::_7(x) => x.$method($($arg),*), - } - } - }; - ($vis:vis fn $method:ident(&self $(, $arg:ident: $ty:ty )* ) $(-> $ret:ty)?) => { - #[allow(unused_parens)] - $vis fn $method(&self, $($arg: $ty),* ) $(-> $ret)* { - match self { - Self::_2(x) => x.$method($($arg),*), - Self::_3(x) => x.$method($($arg),*), - Self::_5(x) => x.$method($($arg),*), - Self::_7(x) => x.$method($($arg),*), - } - } - }; - ($vis:vis fn $method:ident(self $(, $arg:ident: $ty:ty )* ) $(-> $ret:ty)?) => { - #[allow(unused_parens)] - $vis fn $method(self, $($arg: $ty),* ) $(-> $ret)* { - match self { - Self::_2(x) => x.$method($($arg),*), - Self::_3(x) => x.$method($($arg),*), - Self::_5(x) => x.$method($($arg),*), - Self::_7(x) => x.$method($($arg),*), - } - } - } -} - -macro_rules! dispatch_vector { - () => {}; - ($vis:vis fn $method:ident $tt:tt $(-> $ret:tt)?; $($tail:tt)*) => { - dispatch_vector_inner! { - $vis fn $method $tt $(-> $ret)* - } - dispatch_vector!{$($tail)*} - } -} - -macro_rules! 
match_p { - ($p:ident, $($val:tt)*) => { - match *$p { - 2 => Self::_2($($val)*), - 3 => Self::_3($($val)*), - 5 => Self::_5($($val)*), - 7 => Self::_7($($val)*), - _ => panic!("Prime not supported: {}", *$p) - } - } -} - -#[derive(Debug, Hash, Eq, PartialEq, Clone)] -pub enum FpVector { - _2(FpVectorP<2>), - _3(FpVectorP<3>), - _5(FpVectorP<5>), - _7(FpVectorP<7>), -} - -#[derive(Debug, Copy, Clone)] -pub enum Slice<'a> { - _2(SliceP<'a, 2>), - _3(SliceP<'a, 3>), - _5(SliceP<'a, 5>), - _7(SliceP<'a, 7>), -} - -#[derive(Debug)] -pub enum SliceMut<'a> { - _2(SliceMutP<'a, 2>), - _3(SliceMutP<'a, 3>), - _5(SliceMutP<'a, 5>), - _7(SliceMutP<'a, 7>), -} - -pub enum FpVectorNonZeroIterator<'a> { - _2(FpVectorNonZeroIteratorP<'a, 2>), - _3(FpVectorNonZeroIteratorP<'a, 3>), - _5(FpVectorNonZeroIteratorP<'a, 5>), - _7(FpVectorNonZeroIteratorP<'a, 7>), -} - -impl FpVector { - pub fn new(p: ValidPrime, len: usize) -> FpVector { - match_p!(p, FpVectorP::new_(len)) - } - - pub fn new_with_capacity(p: ValidPrime, len: usize, capacity: usize) -> FpVector { - match_p!(p, FpVectorP::new_with_capacity_(len, capacity)) - } - - pub fn from_slice(p: ValidPrime, slice: &[u32]) -> Self { - match_p!(p, FpVectorP::from(&slice)) - } - - pub fn num_limbs(p: ValidPrime, len: usize) -> usize { - let entries_per_limb = entries_per_limb(p); - (len + entries_per_limb - 1) / entries_per_limb - } - pub(crate) fn padded_len(p: ValidPrime, len: usize) -> usize { - Self::num_limbs(p, len) * entries_per_limb(p) - } - - pub fn update_from_bytes(&mut self, data: &mut impl Read) -> std::io::Result<()> { - let limbs = self.limbs_mut(); - let num_limbs = limbs.len(); - - if cfg!(target_endian = "little") { - let num_bytes = num_limbs * size_of::(); - unsafe { - let buf: &mut [u8] = - std::slice::from_raw_parts_mut(limbs.as_mut_ptr() as *mut u8, num_bytes); - data.read_exact(buf).unwrap(); - } - } else { - for entry in limbs { - let mut bytes: [u8; size_of::()] = [0; size_of::()]; - data.read_exact(&mut 
bytes)?; - *entry = Limb::from_le_bytes(bytes); - } - }; - Ok(()) - } - - pub fn from_bytes(p: ValidPrime, len: usize, data: &mut impl Read) -> std::io::Result { - let mut v = Self::new(p, len); - v.update_from_bytes(data)?; - Ok(v) - } - - pub fn to_bytes(&self, buffer: &mut impl Write) -> std::io::Result<()> { - // self.limbs is allowed to have more limbs than necessary, but we only save the - // necessary ones. - let num_limbs = Self::num_limbs(self.prime(), self.len()); - - if cfg!(target_endian = "little") { - let num_bytes = num_limbs * size_of::(); - unsafe { - let buf: &[u8] = - std::slice::from_raw_parts_mut(self.limbs().as_ptr() as *mut u8, num_bytes); - buffer.write_all(buf)?; - } - } else { - for limb in &self.limbs()[0..num_limbs] { - let bytes = limb.to_le_bytes(); - buffer.write_all(&bytes)?; - } - } - Ok(()) - } - - dispatch_vector! { - pub fn prime(&self) -> ValidPrime; - pub fn len(&self) -> usize; - pub fn is_empty(&self) -> bool; - pub fn scale(&mut self, c: u32); - pub fn set_to_zero(&mut self); - pub fn entry(&self, index: usize) -> u32; - pub fn set_entry(&mut self, index: usize, value: u32); - pub fn assign(&mut self, other: &Self); - pub fn assign_partial(&mut self, other: &Self); - pub fn add(&mut self, other: &Self, c: u32); - pub fn add_nosimd(&mut self, other: &Self, c: u32); - pub fn add_offset(&mut self, other: &Self, c: u32, offset: usize); - pub fn add_offset_nosimd(&mut self, other: &Self, c: u32, offset: usize); - pub fn slice(&self, start: usize, end: usize) -> (dispatch Slice); - pub fn as_slice(&self) -> (dispatch Slice); - pub fn slice_mut(&mut self, start: usize, end: usize) -> (dispatch SliceMut); - pub fn as_slice_mut(&mut self) -> (dispatch SliceMut); - pub fn is_zero(&self) -> bool; - pub fn iter(&self) -> FpVectorIterator; - pub fn iter_nonzero(&self) -> (dispatch FpVectorNonZeroIterator); - pub fn extend_len(&mut self, dim: usize); - pub fn set_scratch_vector_size(&mut self, dim: usize); - pub fn add_basis_element(&mut 
self, index: usize, value: u32); - pub fn copy_from_slice(&mut self, slice: &[u32]); - pub(crate) fn trim_start(&mut self, n: usize); - pub fn add_truncate(&mut self, other: &Self, c: u32) -> (Option<()>); - pub fn sign_rule(&self, other: &Self) -> bool; - pub fn add_carry(&mut self, other: &Self, c: u32, rest: &mut [FpVector]) -> bool; - pub fn first_nonzero(&self) -> (Option<(usize, u32)>); - pub fn density(&self) -> f32; - - pub(crate) fn limbs(&self) -> (&[Limb]); - pub(crate) fn limbs_mut(&mut self) -> (&mut [Limb]); - } -} - -impl<'a> Slice<'a> { - dispatch_vector! { - pub fn prime(&self) -> ValidPrime; - pub fn len(&self) -> usize; - pub fn is_empty(&self) -> bool; - pub fn entry(&self, index: usize) -> u32; - pub fn iter(self) -> (FpVectorIterator<'a>); - pub fn iter_nonzero(self) -> (dispatch FpVectorNonZeroIterator 'a); - pub fn is_zero(&self) -> bool; - pub fn slice(self, start: usize, end: usize) -> (dispatch Slice 'a); - pub fn to_owned(self) -> (dispatch FpVector); - } -} - -impl<'a> SliceMut<'a> { - dispatch_vector! 
{ - pub fn prime(&self) -> ValidPrime; - pub fn scale(&mut self, c: u32); - pub fn set_to_zero(&mut self); - pub fn add(&mut self, other: Slice, c: u32); - pub fn assign(&mut self, other: Slice); - pub fn set_entry(&mut self, index: usize, value: u32); - pub fn as_slice(&self) -> (dispatch Slice); - pub fn slice_mut(&mut self, start: usize, end: usize) -> (dispatch SliceMut); - pub fn add_basis_element(&mut self, index: usize, value: u32); - pub fn copy(&mut self) -> (dispatch SliceMut); - pub fn add_masked(&mut self, other: Slice, c: u32, mask: &[usize]); - pub fn add_unmasked(&mut self, other: Slice, c: u32, mask: &[usize]); - } - - pub fn add_tensor(&mut self, offset: usize, coeff: u32, left: Slice, right: Slice) { - match (self, left, right) { - (SliceMut::_2(x), Slice::_2(y), Slice::_2(z)) => x.add_tensor(offset, coeff, y, z), - (SliceMut::_3(x), Slice::_3(y), Slice::_3(z)) => x.add_tensor(offset, coeff, y, z), - (SliceMut::_5(x), Slice::_5(y), Slice::_5(z)) => x.add_tensor(offset, coeff, y, z), - (SliceMut::_7(x), Slice::_7(y), Slice::_7(z)) => x.add_tensor(offset, coeff, y, z), - _ => { - panic!("Applying add_tensor to vectors over different primes"); - } - } - } -} - -impl<'a> FpVectorNonZeroIterator<'a> { - dispatch_vector! 
{ - fn next(&mut self) -> (Option<(usize, u32)>); - } -} - -impl std::fmt::Display for FpVector { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - self.as_slice().fmt(f) - } -} - -impl<'a> std::fmt::Display for Slice<'a> { - /// # Example - /// ``` - /// # use fp::vector::FpVector; - /// # use fp::prime::ValidPrime; - /// let v = FpVector::from_slice(ValidPrime::new(2), &[0, 1, 0]); - /// assert_eq!(&format!("{v}"), "[0, 1, 0]"); - /// assert_eq!(&format!("{v:#}"), "010"); - /// ``` - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - if f.alternate() { - for v in self.iter() { - write!(f, "{v}")?; - } - Ok(()) - } else { - write!(f, "[{}]", self.iter().format(", ")) - } - } -} - -impl From<&FpVector> for Vec { - fn from(v: &FpVector) -> Vec { - v.iter().collect() - } -} - -impl std::ops::AddAssign<&FpVector> for FpVector { - fn add_assign(&mut self, other: &FpVector) { - self.add(other, 1); - } -} - -impl<'a> Iterator for FpVectorNonZeroIterator<'a> { - type Item = (usize, u32); - - fn next(&mut self) -> Option { - self.next() - } -} - -impl<'a> IntoIterator for &'a FpVector { - type IntoIter = FpVectorIterator<'a>; - type Item = u32; - - fn into_iter(self) -> Self::IntoIter { - self.iter() - } -} - -macro_rules! impl_try_into { - ($var:tt, $p:literal) => { - impl<'a> TryInto<&'a mut FpVectorP<$p>> for &'a mut FpVector { - type Error = (); - - fn try_into(self) -> Result<&'a mut FpVectorP<$p>, ()> { - match self { - FpVector::$var(x) => Ok(x), - _ => Err(()), - } - } - } +pub mod base_generic; +pub mod generic; +pub(crate) mod internal; + +#[cfg(feature = "odd-primes")] +pub mod specialized; +pub mod specialized_2; +#[cfg(not(feature = "odd-primes"))] +pub use specialized_2 as specialized; + +pub use specialized::{FpVector, FpVectorNonZeroIterator, Slice, SliceMut}; + +// If odd-primes is disabled, the marker trait `BaseVector` cannot be meaningfully used without also +// having `BaseVectorP` in scope. 
Instead of requiring several imports all over the codebase, +// depending conditionally on feature flags, the prelude streamlines the process. + +pub mod prelude { + #[cfg(feature = "odd-primes")] + pub use super::specialized::{BaseVector, BaseVectorMut}; + #[cfg(not(feature = "odd-primes"))] + pub use super::{ + base_generic::{BaseVectorMutP, BaseVectorP}, + specialized_2::{BaseVector, BaseVectorMut}, }; } -impl_try_into!(_2, 2); -impl_try_into!(_3, 3); -impl_try_into!(_5, 5); -impl_try_into!(_7, 7); - -#[cfg(feature = "json")] -impl Serialize for FpVector { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - Vec::::from(self).serialize(serializer) - } -} - -#[cfg(feature = "json")] -impl<'de> Deserialize<'de> for FpVector { - fn deserialize(_deserializer: D) -> Result - where - D: Deserializer<'de>, - { - panic!("Deserializing FpVector not supported"); - // This is needed for ext-websocket/actions to be happy - } -} - -impl<'a, 'b> From<&'a mut SliceMut<'b>> for SliceMut<'a> { - fn from(slice: &'a mut SliceMut<'b>) -> SliceMut<'a> { - slice.copy() - } -} - -impl<'a, 'b> From<&'a Slice<'b>> for Slice<'a> { - fn from(slice: &'a Slice<'b>) -> Slice<'a> { - *slice - } -} - -impl<'a, 'b> From<&'a SliceMut<'b>> for Slice<'a> { - fn from(slice: &'a SliceMut<'b>) -> Slice<'a> { - slice.as_slice() - } -} - -impl<'a> From<&'a FpVector> for Slice<'a> { - fn from(v: &'a FpVector) -> Slice<'a> { - v.as_slice() - } -} - -impl<'a> From<&'a mut FpVector> for SliceMut<'a> { - fn from(v: &'a mut FpVector) -> SliceMut<'a> { - v.as_slice_mut() - } -} - #[cfg(test)] mod test { - use super::*; - use crate::limb; + use std::fmt::Write as _; // Needed for write! macro for String + + use itertools::Itertools; use rand::Rng; use rstest::rstest; - use std::fmt::Write as _; // Needed for write! 
macro for String + + use super::{prelude::*, FpVector}; + use crate::{limb, prime::ValidPrime}; pub struct VectorDiffEntry { pub index: usize, @@ -567,6 +115,7 @@ mod test { result } + #[cfg(feature = "odd-primes")] macro_rules! test_dim { () => {}; (fn $name:ident($p:ident: ValidPrime) $body:tt $($rest:tt)*) => { @@ -610,6 +159,50 @@ mod test { }; } + #[cfg(not(feature = "odd-primes"))] + macro_rules! test_dim { + () => {}; + (fn $name:ident($p:ident: ValidPrime) $body:tt $($rest:tt)*) => { + #[rstest] + #[trace] + fn $name(#[values(2)] p: u32) { + let $p = ValidPrime::new(p); + + $body + } + test_dim! { $($rest)* } + }; + (fn $name:ident($p:ident: ValidPrime, $dim:ident: usize) $body:tt $($rest:tt)*) => { + #[rstest] + #[trace] + fn $name(#[values(2)] p: u32, #[values(10, 20, 70, 100, 1000)] $dim: usize) { + let $p = ValidPrime::new(p); + + $body + } + test_dim! { $($rest)* } + }; + (fn $name:ident($p:ident: ValidPrime, $dim:ident: usize, $slice_start:ident: usize, $slice_end:ident: usize) $body:tt $($rest:tt)*) => { + #[rstest] + #[trace] + fn $name(#[values(2)] p: u32, #[values(10, 20, 70, 100, 1000)] $dim: usize) { + let $p = ValidPrime::new(p); + + let $slice_start = match $dim { + 10 => 5, + 20 => 10, + 70 => 20, + 100 => 30, + 1000 => 290, + _ => unreachable!(), + }; + let $slice_end = ($dim + $slice_start) / 2; + $body + } + test_dim! { $($rest)* } + }; + } + test_dim! 
{ fn test_serialize(p: ValidPrime, dim: usize) { use std::io::{Seek, Cursor, SeekFrom}; @@ -625,6 +218,18 @@ mod test { v.assert_vec_eq(&w); } + fn test_is_zero(p: ValidPrime, dim: usize) { + let zero_vec = FpVector::from_slice(p, &vec![0; dim]); + let nonzero_vec = { + let mut v = random_vector(p, dim); + v[0] = 1; + FpVector::from_slice(p, &v) + }; + + assert!(zero_vec.is_zero()); + assert!(!nonzero_vec.is_zero()); + } + fn test_add(p: ValidPrime, dim: usize) { let mut v_arr = random_vector(p, dim); let w_arr = random_vector(p, dim); @@ -638,6 +243,51 @@ mod test { v.assert_list_eq(&v_arr); } + fn test_add_basis_element(p: ValidPrime, dim: usize, slice_start: usize, slice_end: usize) { + let mut v_arr = random_vector(p, dim); + let mut v = FpVector::from_slice(p, &v_arr); + let mut slice = v.slice_mut(slice_start, slice_end); + + slice.add_basis_element(1, 1); + v_arr[slice_start + 1] += 1; + v_arr[slice_start + 1] %= *p; + + v.assert_list_eq(&v_arr); + } + + fn test_add_vector(p: ValidPrime, dim: usize, slice_start: usize, slice_end: usize) { + let slice_dim = slice_end - slice_start; + let mut v_arr = random_vector(p, slice_dim); + let w_arr = random_vector(p, dim); + let mut v = FpVector::from_slice(p, &v_arr); + let w = FpVector::from_slice(p, &w_arr); + let w_slice = w.slice(slice_start, slice_end); + + v.add(&w_slice, 1); + for i in 0..slice_dim { + v_arr[i] = (v_arr[i] + w_arr[i + slice_start]) % *p; + } + v.assert_list_eq(&v_arr); + } + + fn test_slice_of_slice(p: ValidPrime, dim: usize, slice_start: usize, slice_end: usize) { + let v_arr = random_vector(p, dim); + let v = FpVector::from_slice(p, &v_arr); + let slice = v.slice(slice_start, slice_end); + + let half_length = (slice_end - slice_start) / 2; + let smaller_slice = slice.slice(0, half_length); + + let mut diffs = Vec::new(); + for (i, val) in smaller_slice.iter().enumerate() { + if v_arr[i + slice_start] != val { + diffs.push((i, val, v.entry(i))); + } + } + assert_eq!(diffs, []); + 
assert_eq!(smaller_slice.len(), half_length); + } + fn test_scale(p: ValidPrime, dim: usize) { let mut v_arr = random_vector(p, dim); let mut rng = rand::thread_rng(); @@ -677,24 +327,6 @@ mod test { assert_eq!(diffs, []); } - fn test_entry_slice(p: ValidPrime, dim: usize, slice_start: usize, slice_end: usize) { - let v_arr = random_vector(p, dim); - let v = FpVector::from_slice(p, &v_arr); - let v = v.slice(slice_start, slice_end); - println!( - "slice_start: {}, slice_end: {}, slice: {}", - slice_start, slice_end, v - ); - - let mut diffs = Vec::new(); - for i in 0..v.len() { - if v.entry(i) != v_arr[i + slice_start] { - diffs.push((i, v_arr[i + slice_start], v.entry(i))); - } - } - assert_eq!(diffs, []); - } - fn test_set_entry(p: ValidPrime, dim: usize) { let mut v = FpVector::new(p, dim); let v_arr = random_vector(p, dim); @@ -778,7 +410,7 @@ mod test { v.assign_partial(&w); assert!(v.slice(dim / 2, dim).is_zero()); assert_eq!(v.len(), dim); - v.slice(0, dim / 2).to_owned().assert_vec_eq(&w); + v.slice(0, dim / 2).into_owned().assert_vec_eq(&w); } fn test_assign_slice_to_slice(p: ValidPrime, dim: usize, slice_start: usize, slice_end: usize) { @@ -797,34 +429,34 @@ mod test { v.assert_list_eq(&v_arr); } - fn test_add_shift_right(p: ValidPrime, dim: usize, slice_start: usize, slice_end: usize) { + fn test_add_shift_left(p: ValidPrime, dim: usize, slice_start: usize, slice_end: usize) { let mut v_arr = random_vector(p, dim); let w_arr = random_vector(p, dim); let mut v = FpVector::from_slice(p, &v_arr); let w = FpVector::from_slice(p, &w_arr); - v.slice_mut(slice_start + 2, slice_end + 2) + v.slice_mut(slice_start - 2, slice_end - 2) .add(w.slice(slice_start, slice_end), 1); - - println!("v : {}", v); - for i in slice_start + 2..slice_end + 2 { - v_arr[i] = (v_arr[i] + w_arr[i - 2]) % *p; + for i in slice_start - 2..slice_end - 2 { + v_arr[i] = (v_arr[i] + w_arr[i + 2]) % *p; } v.assert_list_eq(&v_arr); } - fn test_add_shift_left(p: ValidPrime, dim: usize, 
slice_start: usize, slice_end: usize) { + fn test_add_shift_right(p: ValidPrime, dim: usize, slice_start: usize, slice_end: usize) { let mut v_arr = random_vector(p, dim); let w_arr = random_vector(p, dim); let mut v = FpVector::from_slice(p, &v_arr); let w = FpVector::from_slice(p, &w_arr); - v.slice_mut(slice_start - 2, slice_end - 2) + v.slice_mut(slice_start + 2, slice_end + 2) .add(w.slice(slice_start, slice_end), 1); - for i in slice_start - 2..slice_end - 2 { - v_arr[i] = (v_arr[i] + w_arr[i + 2]) % *p; + + println!("v : {}", v); + for i in slice_start + 2..slice_end + 2 { + v_arr[i] = (v_arr[i] + w_arr[i - 2]) % *p; } v.assert_list_eq(&v_arr); } @@ -864,7 +496,7 @@ mod test { } fn test_iterator_slice(p: ValidPrime) { - let ep = entries_per_limb(p); + let ep = limb::entries_per_limb(p); for &dim in &[5, 10, ep, ep - 1, ep + 1, 3 * ep, 3 * ep - 1, 3 * ep + 1] { let v_arr = random_vector(p, dim); let v = FpVector::from_slice(p, &v_arr); @@ -884,7 +516,7 @@ mod test { } fn test_iterator_skip(p: ValidPrime) { - let ep = entries_per_limb(p); + let ep = limb::entries_per_limb(p); let dim = 5 * ep; for &num_skip in &[ep, ep - 1, ep + 1, 3 * ep, 3 * ep - 1, 3 * ep + 1, 6 * ep] { let v_arr = random_vector(p, dim); @@ -906,7 +538,7 @@ mod test { } fn test_iterator(p: ValidPrime) { - let ep = entries_per_limb(p); + let ep = limb::entries_per_limb(p); for &dim in &[0, 5, 10, ep, ep - 1, ep + 1, 3 * ep, 3 * ep - 1, 3 * ep + 1] { let v_arr = random_vector(p, dim); let v = FpVector::from_slice(p, &v_arr); @@ -1071,6 +703,10 @@ mod test { #[test] #[ignore] fn test_sign_rule() { + use super::{ + base_generic::BaseVectorP, generic::FpVectorP, internal::InternalBaseVectorMutP, + }; + let mut in1 = FpVectorP::<2>::new_(128); let mut in2 = FpVectorP::<2>::new_(128); let tests = [ @@ -1477,10 +1113,10 @@ mod test { ]; let mut diffs = Vec::new(); for &(in1_limb1, in1_limb2, in2_limb1, in2_limb2, res1, res2) in tests.iter() { - in1.limbs_mut()[1] = in1_limb1; - in1.limbs_mut()[0] = 
in1_limb2; - in2.limbs_mut()[1] = in2_limb1; - in2.limbs_mut()[0] = in2_limb2; + in1._limbs_mut()[1] = in1_limb1; + in1._limbs_mut()[0] = in1_limb2; + in2._limbs_mut()[1] = in2_limb1; + in2._limbs_mut()[0] = in2_limb2; let test_res1 = in1.sign_rule(&in2); let test_res2 = in2.sign_rule(&in1); let res = (res1, res2); diff --git a/ext/crates/fp/src/vector/specialized.rs b/ext/crates/fp/src/vector/specialized.rs new file mode 100644 index 0000000000..00f4237cb1 --- /dev/null +++ b/ext/crates/fp/src/vector/specialized.rs @@ -0,0 +1,420 @@ +use std::io::{Read, Write}; + +use itertools::Itertools; + +use super::{ + base_generic::{BaseVectorMutP, BaseVectorP}, + generic::{FpVectorIterator, FpVectorNonZeroIteratorP, FpVectorP, SliceMutP, SliceP}, +}; +use crate::{limb::Limb, prime::ValidPrime}; + +dispatch_type!( + derive(Debug, Hash, Eq, PartialEq, Clone), + pub FpVector { FpVectorP } +); + +dispatch_type!( + derive(Debug, Copy, Clone), + pub Slice<'a> { SliceP } +); + +dispatch_type!( + derive(Debug), + pub SliceMut<'a> { SliceMutP } +); + +dispatch_type!( + derive(), + pub FpVectorNonZeroIterator<'a> { FpVectorNonZeroIteratorP } +); + +macro_rules! dispatch_basevector { + () => { + dispatch_prime! { + fn prime(&self) -> ValidPrime; + fn len(&self) -> usize; + fn is_empty(&self) -> bool; + fn entry(&self, index: usize) -> u32; + fn as_slice(&self) -> (dispatch Slice); + fn is_zero(&self) -> bool; + fn iter(&self) -> FpVectorIterator; + fn iter_nonzero(&self) -> (dispatch FpVectorNonZeroIterator); + fn first_nonzero(&self) -> (Option<(usize, u32)>); + fn sign_rule(&self, other: &Self) -> bool; + fn density(&self) -> f32; + } + + fn slice<'b>(&self, start: usize, end: usize) -> Slice<'b> + where + Self: 'b, + { + match_self_p!(slice(&self, start, end) -> Slice) + } + }; +} + +macro_rules! dispatch_basevectormut { + () => { + dispatch_prime! 
{ + fn scale(&mut self, c: u32); + fn set_to_zero(&mut self); + fn set_entry(&mut self, index: usize, value: u32); + fn slice_mut(&mut self, start: usize, end: usize) -> (dispatch SliceMut); + fn as_slice_mut(&mut self) -> (dispatch SliceMut); + fn add_basis_element(&mut self, index: usize, value: u32); + fn copy_from_slice(&mut self, slice: &[u32]); + } + + dispatch_prime_generic! { + fn assign(&mut self); + fn add(&mut self, c: u32); + fn add_offset(&mut self, c: u32, offset: usize); + fn add_masked(&mut self, c: u32, mask: &[usize]); + fn add_unmasked(&mut self, c: u32, mask: &[usize]); + fn add_truncate(&mut self, c: u32) -> Option<()>; + } + }; +} + +/// Trait for common methods on vector-type structs. +pub trait BaseVector { + fn prime(&self) -> ValidPrime; + fn len(&self) -> usize; + fn is_empty(&self) -> bool; + fn entry(&self, index: usize) -> u32; + fn slice<'a>(&self, start: usize, end: usize) -> Slice<'a> + where + Self: 'a; + fn as_slice(&self) -> Slice; + fn into_owned(self) -> FpVector; + fn is_zero(&self) -> bool; + fn iter(&self) -> FpVectorIterator; + fn iter_nonzero(&self) -> FpVectorNonZeroIterator; + fn first_nonzero(&self) -> Option<(usize, u32)>; + fn sign_rule(&self, other: &Self) -> bool; + fn density(&self) -> f32; +} + +/// Trait for common methods on mutable vector-type structs. 
+pub trait BaseVectorMut: BaseVector { + fn scale(&mut self, c: u32); + fn set_to_zero(&mut self); + fn set_entry(&mut self, index: usize, value: u32); + fn assign<'a, T: Into>>(&mut self, other: T); + fn add<'a, T: Into>>(&mut self, other: T, c: u32); + fn add_offset<'a, T: Into>>(&mut self, other: T, c: u32, offset: usize); + fn slice_mut(&mut self, start: usize, end: usize) -> SliceMut; + fn as_slice_mut(&mut self) -> SliceMut; + fn add_basis_element(&mut self, index: usize, value: u32); + fn copy_from_slice(&mut self, slice: &[u32]); + fn add_masked<'a, T: Into>>(&mut self, other: T, c: u32, mask: &[usize]); + fn add_unmasked<'a, T: Into>>(&mut self, other: T, c: u32, mask: &[usize]); + fn add_truncate<'a, T: Into>>(&mut self, other: T, c: u32) -> Option<()>; +} + +// impls for `FpVector` + +impl BaseVector for FpVector { + dispatch_basevector!(); + + fn into_owned(self) -> FpVector { + self + } +} + +impl BaseVectorMut for FpVector { + dispatch_basevectormut!(); +} + +impl std::ops::AddAssign<&FpVector> for FpVector { + fn add_assign(&mut self, other: &FpVector) { + self.add(other, 1); + } +} + +impl std::fmt::Display for FpVector { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + self.as_slice().fmt(f) + } +} + +impl<'a> IntoIterator for &'a FpVector { + type IntoIter = FpVectorIterator<'a>; + type Item = u32; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +macro_rules! 
impl_try_into { + ($var:tt, $p:literal) => { + impl<'a> TryInto<&'a mut FpVectorP<$p>> for &'a mut FpVector { + type Error = (); + + fn try_into(self) -> Result<&'a mut FpVectorP<$p>, ()> { + match self { + FpVector::$var(ref mut x) => Ok(x), + _ => Err(()), + } + } + } + }; +} + +call_macro_p!(impl_try_into); + +#[cfg(feature = "json")] +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +#[cfg(feature = "json")] +impl Serialize for FpVector { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + Vec::::from(self).serialize(serializer) + } +} + +#[cfg(feature = "json")] +impl<'de> Deserialize<'de> for FpVector { + fn deserialize(_deserializer: D) -> Result + where + D: Deserializer<'de>, + { + panic!("Deserializing FpVector not supported"); + // This is needed for ext-websocket/actions to be happy + } +} + +impl FpVector { + dispatch_prime! { + pub fn assign_partial(&mut self, other: &Self); + pub fn extend_len(&mut self, len: usize); + pub fn set_scratch_vector_size(&mut self, len: usize); + pub(crate) fn trim_start(&mut self, n: usize); + pub fn add_carry(&mut self, other: &Self, c: u32, rest: &mut [FpVector]) -> bool; + pub fn update_from_bytes(&mut self, data: &mut impl Read) -> (std::io::Result<()>); + pub fn to_bytes(&self, buffer: &mut impl Write) -> (std::io::Result<()>); + pub(crate) fn limbs(&self) -> (&[Limb]); + pub(crate) fn limbs_mut(&mut self) -> (&mut [Limb]); + } + + pub fn new(p: ValidPrime, len: usize) -> FpVector { + match_p!(p, FpVectorP::new_(len)) + } + + pub fn new_with_capacity(p: ValidPrime, len: usize, capacity: usize) -> FpVector { + match_p!(p, FpVectorP::new_with_capacity_(len, capacity)) + } + + pub fn from_slice(p: ValidPrime, slice: &[u32]) -> Self { + match_p!(p, FpVectorP::from(&slice)) + } + + pub fn from_bytes(p: ValidPrime, len: usize, data: &mut impl Read) -> std::io::Result { + Ok(match_p!(p, FpVectorP::from_bytes(p, len, data)?)) + } +} + +// impls for `SliceMut` + +impl<'a> BaseVector 
for SliceMut<'a> { + dispatch_basevector!(); + + dispatch_prime! { + fn into_owned(self) -> (dispatch FpVector); + } +} + +impl<'a> BaseVectorMut for SliceMut<'a> { + dispatch_basevectormut!(); +} + +impl<'a> From<&'a mut FpVector> for SliceMut<'a> { + fn from(vec: &'a mut FpVector) -> Self { + vec.as_slice_mut() + } +} + +impl<'a> SliceMut<'a> { + dispatch_prime! { + pub fn copy(&mut self) -> (dispatch SliceMut); + } + + pub fn add_tensor(&mut self, offset: usize, coeff: u32, left: Slice, right: Slice) { + match_self_left_right_p!(add_tensor(&mut self, offset, coeff; left, right)); + } +} + +// impls for `Slice` + +impl<'a> BaseVector for Slice<'a> { + dispatch_basevector!(); + + dispatch_prime! { + fn into_owned(self) -> (dispatch FpVector); + } +} + +impl<'a> std::fmt::Display for Slice<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + if f.alternate() { + for v in self.iter() { + write!(f, "{v}")?; + } + Ok(()) + } else { + write!(f, "[{}]", self.iter().format(", ")) + } + } +} + +impl<'a> From<&'a FpVector> for Slice<'a> { + fn from(vec: &'a FpVector) -> Self { + vec.as_slice() + } +} + +impl<'a> From<&'a mut FpVector> for Slice<'a> { + fn from(vec: &'a mut FpVector) -> Self { + (vec as &'a FpVector).as_slice() + } +} + +// impls for `FpVectorNonZeroIterator` + +impl<'a> Iterator for FpVectorNonZeroIterator<'a> { + type Item = (usize, u32); + + fn next(&mut self) -> Option { + self.next() + } +} + +impl<'a> FpVectorNonZeroIterator<'a> { + dispatch_prime! { + fn next(&mut self) -> (Option<(usize, u32)>); + } +} + +// other trait impls + +impl_from_ref!(SliceMut, SliceMut, SliceMutP, mut); +impl_from_ref!(Slice, Slice, SliceP); +impl_from_ref!(SliceMut, Slice, SliceP); + +impl From<&FpVector> for Vec { + fn from(v: &FpVector) -> Vec { + v.iter().collect() + } +} + +// Tautological impls + +macro_rules! dispatch_prime_tauto_inner { + (fn $method:ident(&self $(, $arg:ident: $ty:ty )*) $(-> $ret:ty)?) 
=> { + fn $method(&self $(,$arg: $ty)*) $(-> $ret)? { + T::$method(self $(,$arg)*) + } + }; + (fn $method:ident (&mut self $(, $arg:ident: $ty:ty )*) $(-> $ret:ty)?) => { + fn $method (&mut self $(,$arg: $ty)*) $(-> $ret)? { + T::$method(self $(,$arg)*) + } + }; + (fn $method:ident <'a, S: Into>> (&mut self $(, $arg:ident: $ty:ty )*) $(-> $ret:ty)?) => { + fn $method <'a, S: Into>> (&mut self $(,$arg: $ty)*) $(-> $ret)? { + T::$method(self $(,$arg)*) + } + }; +} + +macro_rules! dispatch_prime_tauto { + () => {}; + (fn $method:ident $tt:tt $(-> $ret:ty)?; $($tail:tt)*) => { + dispatch_prime_tauto_inner! { + fn $method $tt $(-> $ret)? + } + dispatch_prime_tauto!{$($tail)*} + }; + (fn $method:ident <'a, S: Into>> $tt:tt $(-> $ret:ty)?; $($tail:tt)*) => { + dispatch_prime_tauto_inner! { + fn $method <'a, S: Into>> $tt $(-> $ret)? + } + dispatch_prime_tauto!{$($tail)*} + } +} + +impl BaseVector for &T { + dispatch_prime_tauto! { + fn prime(&self) -> ValidPrime; + fn len(&self) -> usize; + fn is_empty(&self) -> bool; + fn entry(&self, index: usize) -> u32; + fn as_slice(&self) -> Slice; + fn is_zero(&self) -> bool; + fn iter(&self) -> FpVectorIterator; + fn iter_nonzero(&self) -> FpVectorNonZeroIterator; + fn first_nonzero(&self) -> Option<(usize, u32)>; + fn sign_rule(&self, other: &Self) -> bool; + fn density(&self) -> f32; + } + + fn slice<'b>(&self, start: usize, end: usize) -> Slice<'b> + where + Self: 'b, + { + T::slice(self, start, end) + } + + fn into_owned(self) -> FpVector { + T::as_slice(self).into_owned() + } +} + +impl BaseVector for &mut T { + dispatch_prime_tauto! 
{ + fn prime(&self) -> ValidPrime; + fn len(&self) -> usize; + fn is_empty(&self) -> bool; + fn entry(&self, index: usize) -> u32; + fn as_slice(&self) -> Slice; + fn is_zero(&self) -> bool; + fn iter(&self) -> FpVectorIterator; + fn iter_nonzero(&self) -> FpVectorNonZeroIterator; + fn first_nonzero(&self) -> Option<(usize, u32)>; + fn sign_rule(&self, other: &Self) -> bool; + fn density(&self) -> f32; + } + + fn slice<'b>(&self, start: usize, end: usize) -> Slice<'b> + where + Self: 'b, + { + T::slice(self, start, end) + } + + fn into_owned(self) -> FpVector { + T::as_slice(self).into_owned() + } +} + +impl BaseVectorMut for &mut T { + dispatch_prime_tauto! { + fn scale(&mut self, c: u32); + fn set_to_zero(&mut self); + fn set_entry(&mut self, index: usize, value: u32); + fn slice_mut(&mut self, start: usize, end: usize) -> SliceMut; + fn as_slice_mut(&mut self) -> SliceMut; + fn add_basis_element(&mut self, index: usize, value: u32); + fn copy_from_slice(&mut self, slice: &[u32]); + fn assign<'a, S: Into>>(&mut self, other: S); + fn add<'a, S: Into>>(&mut self, other: S, c: u32); + fn add_offset<'a, S: Into>>(&mut self, other: S, c: u32, offset: usize); + fn add_masked<'a, S: Into>>(&mut self, other: S, c: u32, mask: &[usize]); + fn add_unmasked<'a, S: Into>>(&mut self, other: S, c: u32, mask: &[usize]); + fn add_truncate<'a, S: Into>>(&mut self, other: S, c: u32) -> Option<()>; + } +} diff --git a/ext/crates/fp/src/vector/specialized_2.rs b/ext/crates/fp/src/vector/specialized_2.rs new file mode 100644 index 0000000000..bcb6737595 --- /dev/null +++ b/ext/crates/fp/src/vector/specialized_2.rs @@ -0,0 +1,108 @@ +//! This module replaces `specialized` when `odd-primes` is disabled. Instead of producing enum +//! wrappers, it simply rexports `FooP<2>` as `Foo`. 
+ +use super::{ + base_generic::{BaseVectorMutP, BaseVectorP}, + generic::{FpVectorIterator, FpVectorNonZeroIteratorP, FpVectorP, SliceMutP, SliceP}, +}; +use crate::prime::ValidPrime; + +pub type FpVector = FpVectorP<2>; +pub type Slice<'a> = SliceP<'a, 2>; +pub type SliceMut<'a> = SliceMutP<'a, 2>; +pub type FpVectorNonZeroIterator<'a> = FpVectorNonZeroIteratorP<'a, 2>; + +pub trait BaseVector: BaseVectorP<2> {} +impl> BaseVector for T {} + +pub trait BaseVectorMut: BaseVectorMutP<2> {} +impl> BaseVectorMut for T {} + +impl std::ops::AddAssign<&FpVector> for FpVector { + fn add_assign(&mut self, other: &FpVector) { + self.add(other, 1); + } +} + +impl std::fmt::Display for FpVector { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + self.as_slice().fmt(f) + } +} + +impl<'a> IntoIterator for &'a FpVector { + type IntoIter = FpVectorIterator<'a>; + type Item = u32; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +macro_rules! impl_try_into { + ($var:tt, $p:literal) => { + impl<'a> TryInto<&'a mut FpVectorP<$p>> for &'a mut FpVector { + type Error = (); + + fn try_into(self) -> Result<&'a mut FpVectorP<$p>, ()> { + Err(()) + } + } + }; +} + +impl_try_into!(_3, 3); +impl_try_into!(_5, 5); +impl_try_into!(_7, 7); + +use itertools::Itertools; +#[cfg(feature = "json")] +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +#[cfg(feature = "json")] +impl Serialize for FpVector { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + Vec::::from(self).serialize(serializer) + } +} + +#[cfg(feature = "json")] +impl<'de> Deserialize<'de> for FpVector { + fn deserialize(_deserializer: D) -> Result + where + D: Deserializer<'de>, + { + panic!("Deserializing FpVector not supported"); + // This is needed for ext-websocket/actions to be happy + } +} + +impl FpVector { + pub fn new(_p: ValidPrime, len: usize) -> FpVector { + Self::new_(len) + } + + pub fn new_with_capacity(_p: ValidPrime, len: usize, capacity: 
usize) -> FpVector { + Self::new_with_capacity_(len, capacity) + } + + pub fn from_slice(_p: ValidPrime, slice: &[u32]) -> Self { + Self::from(&slice) + } +} + +impl<'a> std::fmt::Display for Slice<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + if f.alternate() { + for v in self.iter() { + write!(f, "{v}")?; + } + Ok(()) + } else { + write!(f, "[{}]", self.iter().format(", ")) + } + } +} diff --git a/ext/crates/fp/src/vector_2.rs b/ext/crates/fp/src/vector_2.rs deleted file mode 100644 index 53191890ae..0000000000 --- a/ext/crates/fp/src/vector_2.rs +++ /dev/null @@ -1,162 +0,0 @@ -//! This module replaces `vector` when `odd-primes` is disabled. Instead of producing enum -//! wrappers, it simply rexports `FooP<2>` as `Foo`. - -use crate::limb::{entries_per_limb_const, Limb}; -use crate::prime::ValidPrime; -use crate::vector_inner::{FpVectorNonZeroIteratorP, FpVectorP, SliceMutP, SliceP}; -use itertools::Itertools; -#[cfg(feature = "json")] -use serde::{Deserialize, Deserializer, Serialize, Serializer}; - -use std::io::{Read, Write}; -use std::mem::size_of; - -pub type FpVector = FpVectorP<2>; -pub type Slice<'a> = SliceP<'a, 2>; -pub type SliceMut<'a> = SliceMutP<'a, 2>; -pub type FpVectorNonZeroIterator<'a> = FpVectorNonZeroIteratorP<'a, 2>; - -impl FpVector { - pub fn new(_p: ValidPrime, len: usize) -> FpVector { - FpVector::new_(len) - } - - pub fn new_with_capacity(_p: ValidPrime, len: usize, capacity: usize) -> FpVector { - FpVector::new_with_capacity_(len, capacity) - } - - pub fn from_slice(_p: ValidPrime, slice: &[u32]) -> Self { - Self::from(&slice) - } - - pub fn num_limbs(_p: ValidPrime, len: usize) -> usize { - let entries_per_limb = entries_per_limb_const::<2>(); - (len + entries_per_limb - 1) / entries_per_limb - } - - #[allow(dead_code)] - pub(crate) fn padded_len(p: ValidPrime, len: usize) -> usize { - Self::num_limbs(p, len) * entries_per_limb_const::<2>() - } - - pub fn update_from_bytes(&mut self, data: &mut impl Read) 
-> std::io::Result<()> { - let limbs = self.limbs_mut(); - let num_limbs = limbs.len(); - - if cfg!(target_endian = "little") { - let num_bytes = num_limbs * size_of::(); - unsafe { - let buf: &mut [u8] = - std::slice::from_raw_parts_mut(limbs.as_mut_ptr() as *mut u8, num_bytes); - data.read_exact(buf).unwrap(); - } - } else { - for entry in limbs { - let mut bytes: [u8; size_of::()] = [0; size_of::()]; - data.read_exact(&mut bytes)?; - *entry = Limb::from_le_bytes(bytes); - } - }; - Ok(()) - } - - pub fn from_bytes(p: ValidPrime, len: usize, data: &mut impl Read) -> std::io::Result { - let mut v = Self::new(p, len); - v.update_from_bytes(data)?; - Ok(v) - } - - pub fn to_bytes(&self, buffer: &mut impl Write) -> std::io::Result<()> { - // self.limbs is allowed to have more limbs than necessary, but we only save the - // necessary ones. - let num_limbs = Self::num_limbs(self.prime(), self.len()); - - if cfg!(target_endian = "little") { - let num_bytes = num_limbs * size_of::(); - unsafe { - let buf: &[u8] = - std::slice::from_raw_parts_mut(self.limbs().as_ptr() as *mut u8, num_bytes); - buffer.write_all(buf)?; - } - } else { - for limb in &self.limbs()[0..num_limbs] { - let bytes = limb.to_le_bytes(); - buffer.write_all(&bytes)?; - } - } - Ok(()) - } -} - -impl std::fmt::Display for FpVector { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - self.as_slice().fmt(f) - } -} - -impl<'a> std::fmt::Display for Slice<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - if f.alternate() { - for v in self.iter() { - write!(f, "{v}")?; - } - Ok(()) - } else { - write!(f, "[{}]", self.iter().format(", ")) - } - } -} - -impl std::ops::AddAssign<&FpVector> for FpVector { - fn add_assign(&mut self, other: &FpVector) { - self.add(other, 1); - } -} - -impl<'a> IntoIterator for &'a FpVector { - type IntoIter = crate::vector_inner::FpVectorIterator<'a>; - type Item = u32; - - fn into_iter(self) -> Self::IntoIter { - self.iter() - } -} - 
-#[cfg(feature = "json")] -impl Serialize for FpVector { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - Vec::::from(self).serialize(serializer) - } -} - -#[cfg(feature = "json")] -impl<'de> Deserialize<'de> for FpVector { - fn deserialize(_deserializer: D) -> Result - where - D: Deserializer<'de>, - { - panic!("Deserializing FpVector not supported"); - // This is needed for ext-websocket/actions to be happy - } -} - -impl<'a, 'b> From<&'a mut SliceMut<'b>> for SliceMut<'a> { - fn from(slice: &'a mut SliceMut<'b>) -> SliceMut<'a> { - slice.copy() - } -} - -impl<'a, 'b> From<&'a Slice<'b>> for Slice<'a> { - fn from(slice: &'a Slice<'b>) -> Slice<'a> { - *slice - } -} - -impl<'a, 'b> From<&'a SliceMut<'b>> for Slice<'a> { - fn from(slice: &'a SliceMut<'b>) -> Slice<'a> { - slice.as_slice() - } -} diff --git a/ext/crates/fp/src/vector_inner.rs b/ext/crates/fp/src/vector_inner.rs deleted file mode 100644 index d823b37572..0000000000 --- a/ext/crates/fp/src/vector_inner.rs +++ /dev/null @@ -1,1221 +0,0 @@ -// This generates better llvm optimization -#![allow(clippy::int_plus_one)] - -use crate::limb::{self, Limb}; - -use crate::constants::BITS_PER_LIMB; -use crate::prime::ValidPrime; -use std::cmp::Ordering; -use std::convert::TryInto; -use std::ops::Range; - -use crate::simd; - -use itertools::Itertools; - -/// An `FpVectorP` is a vector over $\mathbb{F}_p$ for a fixed prime, implemented using const -/// generics. Due to limitations with const generics, we cannot constrain P to actually be a prime, -/// so we allow it to be any u32. However, most functions will panic if P is not a prime. -/// -/// Interally, it packs entries of the vectors into limbs. However, this is an abstraction that -/// must not leave the `fp` library. -#[derive(Debug, Hash, Eq, PartialEq, Clone)] -pub struct FpVectorP { - len: usize, - limbs: Vec, -} - -/// A SliceP is a slice of an FpVectorP. 
This immutably borrows the vector and implements Copy -#[derive(Debug, Copy, Clone)] -pub struct SliceP<'a, const P: u32> { - limbs: &'a [Limb], - start: usize, - end: usize, -} - -/// A `SliceMutP` is a mutable slice of an `FpVectorP`. This mutably borrows the vector. Since it -/// is a mutable borrow, it cannot implement `Copy`. However, it has a [`SliceMutP::copy`] function -/// that imitates the reborrowing, that mutably borrows `SliceMutP` and returns a `SliceMutP` with -/// a shorter lifetime. -#[derive(Debug)] -pub struct SliceMutP<'a, const P: u32> { - limbs: &'a mut [Limb], - start: usize, - end: usize, -} - -impl FpVectorP

{ - pub fn new_(len: usize) -> Self { - let number_of_limbs = limb::number::

(len); - Self { - len, - limbs: vec![0; number_of_limbs], - } - } - - pub fn from_raw_parts(len: usize, limbs: Vec) -> Self { - debug_assert_eq!(limbs.len(), limb::number::

(len)); - Self { len, limbs } - } - - pub fn new_with_capacity_(len: usize, capacity: usize) -> Self { - let mut limbs = Vec::with_capacity(limb::number::

(capacity)); - limbs.resize(limb::number::

(len), 0); - Self { len, limbs } - } - - pub const fn len(&self) -> usize { - self.len - } - - pub const fn is_empty(&self) -> bool { - self.len == 0 - } - - pub const fn prime(&self) -> ValidPrime { - ValidPrime::new(P) - } - - #[must_use] - pub fn slice(&self, start: usize, end: usize) -> SliceP<'_, P> { - assert!(start <= end && end <= self.len); - SliceP { - limbs: &self.limbs, - start, - end, - } - } - - #[must_use] - pub fn slice_mut(&mut self, start: usize, end: usize) -> SliceMutP<'_, P> { - assert!(start <= end && end <= self.len); - SliceMutP { - limbs: &mut self.limbs, - start, - end, - } - } - - #[inline] - #[must_use] - pub fn as_slice(&self) -> SliceP<'_, P> { - self.into() - } - - #[inline] - #[must_use] - pub fn as_slice_mut(&mut self) -> SliceMutP<'_, P> { - self.into() - } - - pub fn add_basis_element(&mut self, index: usize, value: u32) { - self.as_slice_mut().add_basis_element(index, value); - } - - pub fn entry(&self, index: usize) -> u32 { - self.as_slice().entry(index) - } - - pub fn set_entry(&mut self, index: usize, value: u32) { - self.as_slice_mut().set_entry(index, value); - } - - pub fn iter(&self) -> FpVectorIterator { - self.as_slice().iter() - } - - pub fn iter_nonzero(&self) -> FpVectorNonZeroIteratorP<'_, P> { - self.as_slice().iter_nonzero() - } - - pub fn set_to_zero(&mut self) { - for limb in &mut self.limbs { - *limb = 0; - } - } - - pub fn scale(&mut self, c: u32) { - match P { - 2 => { - if c == 0 { - self.set_to_zero() - } - } - 3 | 5 => { - for limb in &mut self.limbs { - *limb = limb::reduce::

(*limb * c as Limb); - } - } - _ => { - for limb in &mut self.limbs { - *limb = limb::pack::<_, P>(limb::unpack::

(*limb).map(|x| (x * c) % P)); - } - } - } - } - - /// Add `other` to `self` on the assumption that the first `offset` entries of `other` are - /// empty. - pub fn add_offset(&mut self, other: &FpVectorP

, c: u32, offset: usize) { - assert_eq!(self.len(), other.len()); - let min_limb = offset / limb::entries_per_limb_const::

(); - if P == 2 { - if c != 0 { - simd::add_simd(&mut self.limbs, &other.limbs, min_limb); - } - } else { - for (left, right) in self.limbs.iter_mut().zip_eq(&other.limbs).skip(min_limb) { - *left = limb::add::

(*left, *right, c); - } - for limb in &mut self.limbs[min_limb..] { - *limb = limb::reduce::

(*limb); - } - } - } - - /// Add `other` to `self` on the assumption that the first `offset` entries of `other` are - /// empty. - pub fn add_offset_nosimd(&mut self, other: &FpVectorP

, c: u32, offset: usize) { - assert_eq!(self.len(), other.len()); - let min_limb = offset / limb::entries_per_limb_const::

(); - if P == 2 { - if c != 0 { - for i in 0..self.limbs.len() { - self.limbs[i] ^= other.limbs[i]; - } - } - } else { - for (left, right) in self.limbs.iter_mut().zip_eq(&other.limbs).skip(min_limb) { - *left = limb::add::

(*left, *right, c); - } - for limb in &mut self.limbs[min_limb..] { - *limb = limb::reduce::

(*limb); - } - } - } - - pub fn add(&mut self, other: &FpVectorP

, c: u32) { - self.add_offset(other, c, 0); - } - - pub fn add_nosimd(&mut self, other: &FpVectorP

, c: u32) { - self.add_offset_nosimd(other, c, 0); - } - - pub fn assign(&mut self, other: &Self) { - debug_assert_eq!(self.len(), other.len()); - self.limbs.copy_from_slice(&other.limbs) - } - - /// A version of [`FpVectorP::assign`] that allows `other` to be shorter than `self`. - pub fn assign_partial(&mut self, other: &Self) { - debug_assert!(other.len() <= self.len()); - self.limbs[0..other.limbs.len()].copy_from_slice(&other.limbs); - for limb in self.limbs[other.limbs.len()..].iter_mut() { - *limb = 0; - } - } - - pub fn is_zero(&self) -> bool { - self.limbs.iter().all(|&x| x == 0) - } - - pub(crate) fn limbs(&self) -> &[Limb] { - &self.limbs - } - - pub(crate) fn limbs_mut(&mut self) -> &mut [Limb] { - &mut self.limbs - } - - /// This function ensures the length of the vector is at least `len`. See also - /// `set_scratch_vector_size`. - pub fn extend_len(&mut self, len: usize) { - if self.len >= len { - return; - } - self.len = len; - self.limbs.resize(limb::number::

(len), 0); - } - - /// This clears the vector and sets the length to `len`. This is useful for reusing - /// allocations of temporary vectors. - pub fn set_scratch_vector_size(&mut self, len: usize) { - self.limbs.clear(); - self.limbs.resize(limb::number::

(len), 0); - self.len = len; - } - - /// This replaces the contents of the vector with the contents of the slice. The two must have - /// the same length. - pub fn copy_from_slice(&mut self, slice: &[u32]) { - assert_eq!(self.len, slice.len()); - - self.limbs.clear(); - self.limbs.extend( - slice - .chunks(limb::entries_per_limb_const::

()) - .map(|x| limb::pack::<_, P>(x.iter().copied())), - ); - } - - /// Permanently remove the first `n` elements in the vector. `n` must be a multiple of - /// the number of entries per limb - pub(crate) fn trim_start(&mut self, n: usize) { - assert!(n <= self.len); - let entries_per = limb::entries_per_limb_const::

(); - assert_eq!(n % entries_per, 0); - let num_limbs = n / entries_per; - self.limbs.drain(0..num_limbs); - self.len -= n; - } - - pub fn sign_rule(&self, other: &Self) -> bool { - assert_eq!(P, 2); - let mut result = 0; - for target_limb_idx in 0..self.limbs.len() { - let target_limb = other.limbs[target_limb_idx]; - let source_limb = self.limbs[target_limb_idx]; - result ^= limb::sign_rule(target_limb, source_limb); - if target_limb.count_ones() % 2 == 0 { - continue; - } - for _ in 0..target_limb_idx { - result ^= source_limb.count_ones() % 2; - } - } - result == 1 - } - - pub fn add_truncate(&mut self, other: &Self, c: u32) -> Option<()> { - for (left, right) in self.limbs.iter_mut().zip_eq(&other.limbs) { - *left = limb::add::

(*left, *right, c); - *left = limb::truncate::

(*left)?; - } - Some(()) - } - - fn add_carry_limb(&mut self, idx: usize, source: Limb, c: u32, rest: &mut [T]) -> bool - where - for<'a> &'a mut T: TryInto<&'a mut Self>, - { - if P == 2 { - if c == 0 { - return false; - } - let mut cur_vec = self; - let mut carry = source; - for carry_vec in rest.iter_mut() { - let carry_vec = carry_vec - .try_into() - .ok() - .expect("rest vectors in add_carry must be of the same prime"); - let rem = cur_vec.limbs[idx] ^ carry; - let quot = cur_vec.limbs[idx] & carry; - cur_vec.limbs[idx] = rem; - carry = quot; - cur_vec = carry_vec; - if quot == 0 { - return false; - } - } - cur_vec.limbs[idx] ^= carry; - true - } else { - unimplemented!() - } - } - - pub fn add_carry(&mut self, other: &Self, c: u32, rest: &mut [T]) -> bool - where - for<'a> &'a mut T: TryInto<&'a mut Self>, - { - let mut result = false; - for i in 0..self.limbs.len() { - result |= self.add_carry_limb(i, other.limbs[i], c, rest); - } - result - } - - /// Find the index and value of the first non-zero entry of the vector. `None` if the vector is zero. - pub fn first_nonzero(&self) -> Option<(usize, u32)> { - let entries_per_limb = limb::entries_per_limb_const::

(); - let bit_length = limb::bit_length_const::

(); - let bitmask = limb::bitmask::

(); - for (i, &limb) in self.limbs.iter().enumerate() { - if limb == 0 { - continue; - } - let index = limb.trailing_zeros() as usize / bit_length; - return Some(( - i * entries_per_limb + index, - ((limb >> (index * bit_length)) & bitmask) as u32, - )); - } - None - } - - pub fn density(&self) -> f32 { - let num_nonzero = if P == 2 { - self.limbs - .iter() - .copied() - .map(Limb::count_ones) - .sum::() as usize - } else { - self.iter_nonzero().count() - }; - num_nonzero as f32 / self.len() as f32 - } -} - -impl<'a, const P: u32> From<&'a FpVectorP

> for SliceP<'a, P> { - fn from(v: &'a FpVectorP

) -> Self { - v.slice(0, v.len) - } -} - -impl<'a, const P: u32> From<&'a mut FpVectorP

> for SliceMutP<'a, P> { - fn from(v: &'a mut FpVectorP

) -> Self { - v.slice_mut(0, v.len) - } -} - -impl<'a, const P: u32> SliceMutP<'a, P> { - pub fn slice_mut(&mut self, start: usize, end: usize) -> SliceMutP<'_, P> { - assert!(start <= end && end <= self.as_slice().len()); - - SliceMutP { - limbs: &mut *self.limbs, - start: self.start + start, - end: self.start + end, - } - } - - #[inline] - #[must_use] - pub fn as_slice(&self) -> SliceP<'_, P> { - SliceP { - limbs: &*self.limbs, - start: self.start, - end: self.end, - } - } - - /// Generates a version of itself with a shorter lifetime - #[inline] - #[must_use] - pub fn copy(&mut self) -> SliceMutP<'_, P> { - SliceMutP { - limbs: self.limbs, - start: self.start, - end: self.end, - } - } -} - -impl<'a, const P: u32> SliceP<'a, P> { - #[must_use] - pub fn slice(self, start: usize, end: usize) -> SliceP<'a, P> { - assert!(start <= end && end <= self.len()); - - SliceP { - limbs: self.limbs, - start: self.start + start, - end: self.start + end, - } - } - - /// Converts a slice to an owned FpVectorP. This is vastly more efficient if the start of the vector is aligned. - #[must_use] - pub fn to_owned(self) -> FpVectorP

{ - let mut new = FpVectorP::

::new_(self.len()); - if self.start % limb::entries_per_limb_const::

() == 0 { - let limb_range = self.limb_range(); - new.limbs[0..limb_range.len()].copy_from_slice(&self.limbs[limb_range]); - if !new.limbs.is_empty() { - let len = new.limbs.len(); - new.limbs[len - 1] &= self.limb_masks().1; - } - } else { - new.as_slice_mut().assign(self); - } - new - } -} - -// Public methods -impl<'a, const P: u32> SliceP<'a, P> { - pub fn prime(&self) -> ValidPrime { - ValidPrime::new(P) - } - - pub fn len(&self) -> usize { - self.end - self.start - } - - pub const fn is_empty(&self) -> bool { - self.start == self.end - } - - pub fn entry(&self, index: usize) -> u32 { - debug_assert!( - index < self.len(), - "Index {} too large, length of vector is only {}.", - index, - self.len() - ); - let bit_mask = limb::bitmask::

(); - let limb_index = limb::limb_bit_index_pair::

(index + self.start); - let mut result = self.limbs[limb_index.limb]; - result >>= limb_index.bit_index; - result &= bit_mask; - result as u32 - } - - /// TODO: implement prime 2 version - pub fn iter(self) -> FpVectorIterator<'a> { - FpVectorIterator::new(self) - } - - pub fn iter_nonzero(self) -> FpVectorNonZeroIteratorP<'a, P> { - FpVectorNonZeroIteratorP::new(self) - } - - pub fn is_zero(&self) -> bool { - let limb_range = self.limb_range(); - if limb_range.is_empty() { - return true; - } - let (min_mask, max_mask) = self.limb_masks(); - if self.limbs[limb_range.start] & min_mask != 0 { - return false; - } - - let inner_range = self.limb_range_inner(); - if !inner_range.is_empty() && self.limbs[inner_range].iter().any(|&x| x != 0) { - return false; - } - if self.limbs[limb_range.end - 1] & max_mask != 0 { - return false; - } - true - } -} - -// Limb methods -impl<'a, const P: u32> SliceP<'a, P> { - #[inline] - fn offset(&self) -> usize { - let bit_length = limb::bit_length_const::

(); - let entries_per_limb = limb::entries_per_limb_const::

(); - (self.start % entries_per_limb) * bit_length - } - - #[inline] - fn limb_range(&self) -> Range { - limb::range::

(self.start, self.end) - } - - /// This function underflows if `self.end == 0`, which happens if and only if we are taking a - /// slice of width 0 at the start of an `FpVector`. This should be a very rare edge case. - /// Dealing with the underflow properly would probably require using `saturating_sub` or - /// something of that nature, and that has a nontrivial (10%) performance hit. - #[inline] - fn limb_range_inner(&self) -> Range { - let range = self.limb_range(); - (range.start + 1)..(usize::max(range.start + 1, range.end - 1)) - } - - #[inline(always)] - fn min_limb_mask(&self) -> Limb { - !0 << self.offset() - } - - #[inline(always)] - fn max_limb_mask(&self) -> Limb { - let num_entries = 1 + (self.end - 1) % limb::entries_per_limb_const::

(); - let bit_max = num_entries * limb::bit_length_const::

(); - - (!0) >> (BITS_PER_LIMB - bit_max) - } - - #[inline(always)] - fn limb_masks(&self) -> (Limb, Limb) { - if self.limb_range().len() == 1 { - ( - self.min_limb_mask() & self.max_limb_mask(), - self.min_limb_mask() & self.max_limb_mask(), - ) - } else { - (self.min_limb_mask(), self.max_limb_mask()) - } - } -} - -impl<'a, const P: u32> SliceMutP<'a, P> { - pub fn prime(&self) -> ValidPrime { - ValidPrime::new(P) - } - - pub fn add_basis_element(&mut self, index: usize, value: u32) { - if P == 2 { - // Checking for value % 2 == 0 appears to be less performant - let pair = limb::limb_bit_index_pair::<2>(index + self.start); - self.limbs[pair.limb] ^= (value as Limb % 2) << pair.bit_index; - } else { - let mut x = self.as_slice().entry(index); - x += value; - x %= P; - self.set_entry(index, x); - } - } - - pub fn set_entry(&mut self, index: usize, value: u32) { - debug_assert!(index < self.as_slice().len()); - let bit_mask = limb::bitmask::

(); - let limb_index = limb::limb_bit_index_pair::

(index + self.start); - let mut result = self.limbs[limb_index.limb]; - result &= !(bit_mask << limb_index.bit_index); - result |= (value as Limb) << limb_index.bit_index; - self.limbs[limb_index.limb] = result; - } - - fn reduce_limbs(&mut self) { - if P != 2 { - let limb_range = self.as_slice().limb_range(); - - for limb in &mut self.limbs[limb_range] { - *limb = limb::reduce::

(*limb); - } - } - } - - pub fn scale(&mut self, c: u32) { - if P == 2 { - if c == 0 { - self.set_to_zero(); - } - return; - } - - let c = c as Limb; - let limb_range = self.as_slice().limb_range(); - if limb_range.is_empty() { - return; - } - let (min_mask, max_mask) = self.as_slice().limb_masks(); - - let limb = self.limbs[limb_range.start]; - let masked_limb = limb & min_mask; - let rest_limb = limb & !min_mask; - self.limbs[limb_range.start] = (masked_limb * c) | rest_limb; - - let inner_range = self.as_slice().limb_range_inner(); - for limb in &mut self.limbs[inner_range] { - *limb *= c; - } - if limb_range.len() > 1 { - let full_limb = self.limbs[limb_range.end - 1]; - let masked_limb = full_limb & max_mask; - let rest_limb = full_limb & !max_mask; - self.limbs[limb_range.end - 1] = (masked_limb * c) | rest_limb; - } - self.reduce_limbs(); - } - - pub fn set_to_zero(&mut self) { - let limb_range = self.as_slice().limb_range(); - if limb_range.is_empty() { - return; - } - let (min_mask, max_mask) = self.as_slice().limb_masks(); - self.limbs[limb_range.start] &= !min_mask; - - let inner_range = self.as_slice().limb_range_inner(); - for limb in &mut self.limbs[inner_range] { - *limb = 0; - } - self.limbs[limb_range.end - 1] &= !max_mask; - } - - pub fn add(&mut self, other: SliceP<'_, P>, c: u32) { - debug_assert!(c < P); - if self.as_slice().is_empty() { - return; - } - - if P == 2 { - if c != 0 { - match self.as_slice().offset().cmp(&other.offset()) { - Ordering::Equal => self.add_shift_none(other, 1), - Ordering::Less => self.add_shift_left(other, 1), - Ordering::Greater => self.add_shift_right(other, 1), - }; - } - } else { - match self.as_slice().offset().cmp(&other.offset()) { - Ordering::Equal => self.add_shift_none(other, c), - Ordering::Less => self.add_shift_left(other, c), - Ordering::Greater => self.add_shift_right(other, c), - }; - } - } - - /// `coeff` need not be reduced mod p. - /// Adds v otimes w to self. 
- pub fn add_tensor(&mut self, offset: usize, coeff: u32, left: SliceP

, right: SliceP

) { - let right_dim = right.len(); - - for (i, v) in left.iter_nonzero() { - let entry = (v * coeff) % *self.prime(); - self.slice_mut(offset + i * right_dim, offset + (i + 1) * right_dim) - .add(right, entry); - } - } - - /// TODO: improve efficiency - pub fn assign(&mut self, other: SliceP<'_, P>) { - if self.as_slice().offset() != other.offset() { - self.set_to_zero(); - self.add(other, 1); - return; - } - let target_range = self.as_slice().limb_range(); - let source_range = other.limb_range(); - - if target_range.is_empty() { - return; - } - - let (min_mask, max_mask) = other.limb_masks(); - - let result = other.limbs[source_range.start] & min_mask; - self.limbs[target_range.start] &= !min_mask; - self.limbs[target_range.start] |= result; - - let target_inner_range = self.as_slice().limb_range_inner(); - let source_inner_range = other.limb_range_inner(); - if !target_inner_range.is_empty() && !source_inner_range.is_empty() { - self.limbs[target_inner_range].clone_from_slice(&other.limbs[source_inner_range]); - } - - let result = other.limbs[source_range.end - 1] & max_mask; - self.limbs[target_range.end - 1] &= !max_mask; - self.limbs[target_range.end - 1] |= result; - } - - /// Adds `c` * `other` to `self`. `other` must have the same length, offset, and prime as self, and `c` must be between `0` and `p - 1`. - pub fn add_shift_none(&mut self, other: SliceP<'_, P>, c: u32) { - let target_range = self.as_slice().limb_range(); - let source_range = other.limb_range(); - - let (min_mask, max_mask) = other.limb_masks(); - - self.limbs[target_range.start] = limb::add::

( - self.limbs[target_range.start], - other.limbs[source_range.start] & min_mask, - c, - ); - self.limbs[target_range.start] = limb::reduce::

(self.limbs[target_range.start]); - - let target_inner_range = self.as_slice().limb_range_inner(); - let source_inner_range = other.limb_range_inner(); - if !source_inner_range.is_empty() { - for (left, right) in self.limbs[target_inner_range] - .iter_mut() - .zip_eq(&other.limbs[source_inner_range]) - { - *left = limb::add::

(*left, *right, c); - *left = limb::reduce::

(*left); - } - } - if source_range.len() > 1 { - // The first and last limbs are distinct, so we process the last. - self.limbs[target_range.end - 1] = limb::add::

( - self.limbs[target_range.end - 1], - other.limbs[source_range.end - 1] & max_mask, - c, - ); - self.limbs[target_range.end - 1] = limb::reduce::

(self.limbs[target_range.end - 1]); - } - } - - fn add_shift_left(&mut self, other: SliceP<'_, P>, c: u32) { - let dat = AddShiftLeftData::new(self.as_slice(), other); - let mut i = 0; - { - self.limbs[i + dat.min_target_limb] = limb::add::

( - self.limbs[i + dat.min_target_limb], - dat.mask_first_limb(other, i + dat.min_source_limb), - c, - ); - } - for i in 1..dat.number_of_source_limbs - 1 { - self.limbs[i + dat.min_target_limb] = limb::add::

( - self.limbs[i + dat.min_target_limb], - dat.mask_middle_limb_a(other, i + dat.min_source_limb), - c, - ); - self.limbs[i + dat.min_target_limb - 1] = limb::add::

( - self.limbs[i + dat.min_target_limb - 1], - dat.mask_middle_limb_b(other, i + dat.min_source_limb), - c, - ); - self.limbs[i + dat.min_target_limb - 1] = - limb::reduce::

(self.limbs[i + dat.min_target_limb - 1]); - } - i = dat.number_of_source_limbs - 1; - if i > 0 { - self.limbs[i + dat.min_target_limb - 1] = limb::add::

( - self.limbs[i + dat.min_target_limb - 1], - dat.mask_last_limb_a(other, i + dat.min_source_limb), - c, - ); - self.limbs[i + dat.min_target_limb - 1] = - limb::reduce::

(self.limbs[i + dat.min_target_limb - 1]); - if dat.number_of_source_limbs == dat.number_of_target_limbs { - self.limbs[i + dat.min_target_limb] = limb::add::

( - self.limbs[i + dat.min_target_limb], - dat.mask_last_limb_b(other, i + dat.min_source_limb), - c, - ); - self.limbs[i + dat.min_target_limb] = - limb::reduce::

(self.limbs[i + dat.min_target_limb]); - } - } else { - self.limbs[i + dat.min_target_limb] = - limb::reduce::

(self.limbs[i + dat.min_target_limb]); - } - } - - fn add_shift_right(&mut self, other: SliceP<'_, P>, c: u32) { - let dat = AddShiftRightData::new(self.as_slice(), other); - let mut i = 0; - { - self.limbs[i + dat.min_target_limb] = limb::add::

( - self.limbs[i + dat.min_target_limb], - dat.mask_first_limb_a(other, i + dat.min_source_limb), - c, - ); - self.limbs[i + dat.min_target_limb] = - limb::reduce::

(self.limbs[i + dat.min_target_limb]); - if dat.number_of_target_limbs > 1 { - self.limbs[i + dat.min_target_limb + 1] = limb::add::

( - self.limbs[i + dat.min_target_limb + 1], - dat.mask_first_limb_b(other, i + dat.min_source_limb), - c, - ); - } - } - for i in 1..dat.number_of_source_limbs - 1 { - self.limbs[i + dat.min_target_limb] = limb::add::

( - self.limbs[i + dat.min_target_limb], - dat.mask_middle_limb_a(other, i + dat.min_source_limb), - c, - ); - self.limbs[i + dat.min_target_limb] = - limb::reduce::

(self.limbs[i + dat.min_target_limb]); - self.limbs[i + dat.min_target_limb + 1] = limb::add::

( - self.limbs[i + dat.min_target_limb + 1], - dat.mask_middle_limb_b(other, i + dat.min_source_limb), - c, - ); - } - i = dat.number_of_source_limbs - 1; - if i > 0 { - self.limbs[i + dat.min_target_limb] = limb::add::

( - self.limbs[i + dat.min_target_limb], - dat.mask_last_limb_a(other, i + dat.min_source_limb), - c, - ); - self.limbs[i + dat.min_target_limb] = - limb::reduce::

(self.limbs[i + dat.min_target_limb]); - if dat.number_of_target_limbs > dat.number_of_source_limbs { - self.limbs[i + dat.min_target_limb + 1] = limb::add::

( - self.limbs[i + dat.min_target_limb + 1], - dat.mask_last_limb_b(other, i + dat.min_source_limb), - c, - ); - } - } - if dat.number_of_target_limbs > dat.number_of_source_limbs { - self.limbs[i + dat.min_target_limb + 1] = - limb::reduce::

(self.limbs[i + dat.min_target_limb + 1]); - } - } - - /// Given a mask v, add the `v[i]`th entry of `other` to the `i`th entry of `self`. - pub fn add_masked(&mut self, other: SliceP<'_, P>, c: u32, mask: &[usize]) { - // TODO: If this ends up being a bottleneck, try to use PDEP/PEXT - assert_eq!(self.as_slice().len(), mask.len()); - for (i, &x) in mask.iter().enumerate() { - let entry = other.entry(x); - if entry != 0 { - self.add_basis_element(i, entry * c); - } - } - } - - /// Given a mask v, add the `i`th entry of `other` to the `v[i]`th entry of `self`. - pub fn add_unmasked(&mut self, other: SliceP<'_, P>, c: u32, mask: &[usize]) { - assert!(other.len() <= mask.len()); - for (i, v) in other.iter_nonzero() { - self.add_basis_element(mask[i], v * c); - } - } -} - -struct AddShiftLeftData { - offset_shift: usize, - tail_shift: usize, - zero_bits: usize, - min_source_limb: usize, - min_target_limb: usize, - number_of_source_limbs: usize, - number_of_target_limbs: usize, - min_mask: Limb, - max_mask: Limb, -} - -impl AddShiftLeftData { - fn new(target: SliceP<'_, P>, source: SliceP<'_, P>) -> Self { - debug_assert!(target.prime() == source.prime()); - debug_assert!(target.offset() <= source.offset()); - debug_assert!( - target.len() == source.len(), - "self.dim {} not equal to other.dim {}", - target.len(), - source.len() - ); - let offset_shift = source.offset() - target.offset(); - let bit_length = limb::bit_length_const::

(); - let entries_per_limb = limb::entries_per_limb_const::

(); - let usable_bits_per_limb = bit_length * entries_per_limb; - let tail_shift = usable_bits_per_limb - offset_shift; - let zero_bits = BITS_PER_LIMB - usable_bits_per_limb; - let source_range = source.limb_range(); - let target_range = target.limb_range(); - let min_source_limb = source_range.start; - let min_target_limb = target_range.start; - let number_of_source_limbs = source_range.len(); - let number_of_target_limbs = target_range.len(); - let (min_mask, max_mask) = source.limb_masks(); - - Self { - offset_shift, - tail_shift, - zero_bits, - min_source_limb, - min_target_limb, - number_of_source_limbs, - number_of_target_limbs, - min_mask, - max_mask, - } - } - - fn mask_first_limb(&self, other: SliceP<'_, P>, i: usize) -> Limb { - (other.limbs[i] & self.min_mask) >> self.offset_shift - } - - fn mask_middle_limb_a(&self, other: SliceP<'_, P>, i: usize) -> Limb { - other.limbs[i] >> self.offset_shift - } - - fn mask_middle_limb_b(&self, other: SliceP<'_, P>, i: usize) -> Limb { - (other.limbs[i] << (self.tail_shift + self.zero_bits)) >> self.zero_bits - } - - fn mask_last_limb_a(&self, other: SliceP<'_, P>, i: usize) -> Limb { - let source_limb_masked = other.limbs[i] & self.max_mask; - source_limb_masked << self.tail_shift - } - - fn mask_last_limb_b(&self, other: SliceP<'_, P>, i: usize) -> Limb { - let source_limb_masked = other.limbs[i] & self.max_mask; - source_limb_masked >> self.offset_shift - } -} - -struct AddShiftRightData { - offset_shift: usize, - tail_shift: usize, - zero_bits: usize, - min_source_limb: usize, - min_target_limb: usize, - number_of_source_limbs: usize, - number_of_target_limbs: usize, - min_mask: Limb, - max_mask: Limb, -} - -impl AddShiftRightData { - fn new(target: SliceP<'_, P>, source: SliceP<'_, P>) -> Self { - debug_assert!(target.prime() == source.prime()); - debug_assert!(target.offset() >= source.offset()); - debug_assert!( - target.len() == source.len(), - "self.dim {} not equal to other.dim {}", - target.len(), - 
source.len() - ); - let offset_shift = target.offset() - source.offset(); - let bit_length = limb::bit_length_const::

(); - let entries_per_limb = limb::entries_per_limb_const::

(); - let usable_bits_per_limb = bit_length * entries_per_limb; - let tail_shift = usable_bits_per_limb - offset_shift; - let zero_bits = BITS_PER_LIMB - usable_bits_per_limb; - let source_range = source.limb_range(); - let target_range = target.limb_range(); - let min_source_limb = source_range.start; - let min_target_limb = target_range.start; - let number_of_source_limbs = source_range.len(); - let number_of_target_limbs = target_range.len(); - let (min_mask, max_mask) = source.limb_masks(); - Self { - offset_shift, - tail_shift, - zero_bits, - min_source_limb, - min_target_limb, - number_of_source_limbs, - number_of_target_limbs, - min_mask, - max_mask, - } - } - - fn mask_first_limb_a(&self, other: SliceP<'_, P>, i: usize) -> Limb { - let source_limb_masked = other.limbs[i] & self.min_mask; - (source_limb_masked << (self.offset_shift + self.zero_bits)) >> self.zero_bits - } - - fn mask_first_limb_b(&self, other: SliceP<'_, P>, i: usize) -> Limb { - let source_limb_masked = other.limbs[i] & self.min_mask; - source_limb_masked >> self.tail_shift - } - - fn mask_middle_limb_a(&self, other: SliceP<'_, P>, i: usize) -> Limb { - (other.limbs[i] << (self.offset_shift + self.zero_bits)) >> self.zero_bits - } - - fn mask_middle_limb_b(&self, other: SliceP<'_, P>, i: usize) -> Limb { - other.limbs[i] >> self.tail_shift - } - - fn mask_last_limb_a(&self, other: SliceP<'_, P>, i: usize) -> Limb { - let source_limb_masked = other.limbs[i] & self.max_mask; - source_limb_masked << self.offset_shift - } - - fn mask_last_limb_b(&self, other: SliceP<'_, P>, i: usize) -> Limb { - let source_limb_masked = other.limbs[i] & self.max_mask; - source_limb_masked >> self.tail_shift - } -} - -impl, const P: u32> From<&T> for FpVectorP

{ - fn from(slice: &T) -> Self { - let mut v = Self::new_(slice.as_ref().len()); - v.limbs.clear(); - v.limbs.extend( - slice - .as_ref() - .chunks(limb::entries_per_limb_const::

()) - .map(|x| limb::pack::<_, P>(x.iter().copied())), - ); - v - } -} - -impl From<&FpVectorP

> for Vec { - fn from(vec: &FpVectorP

) -> Vec { - vec.iter().collect() - } -} - -pub struct FpVectorIterator<'a> { - limbs: &'a [Limb], - bit_length: usize, - bit_mask: Limb, - entries_per_limb_m_1: usize, - limb_index: usize, - entries_left: usize, - cur_limb: Limb, - counter: usize, -} - -impl<'a> FpVectorIterator<'a> { - fn new(vec: SliceP<'a, P>) -> Self { - let counter = vec.len(); - let limbs = &vec.limbs; - - if counter == 0 { - return Self { - limbs, - bit_length: 0, - entries_per_limb_m_1: 0, - bit_mask: 0, - limb_index: 0, - entries_left: 0, - cur_limb: 0, - counter, - }; - } - let pair = limb::limb_bit_index_pair::

(vec.start); - - let bit_length = limb::bit_length_const::

(); - let cur_limb = limbs[pair.limb] >> pair.bit_index; - - let entries_per_limb = limb::entries_per_limb_const::

(); - Self { - limbs, - bit_length, - entries_per_limb_m_1: entries_per_limb - 1, - bit_mask: limb::bitmask::

(), - limb_index: pair.limb, - entries_left: entries_per_limb - (vec.start % entries_per_limb), - cur_limb, - counter, - } - } - - pub fn skip_n(&mut self, mut n: usize) { - if n >= self.counter { - self.counter = 0; - return; - } - let entries_per_limb = self.entries_per_limb_m_1 + 1; - if n < self.entries_left { - self.entries_left -= n; - self.counter -= n; - self.cur_limb >>= self.bit_length * n; - return; - } - - n -= self.entries_left; - self.counter -= self.entries_left; - self.entries_left = 0; - - let skip_limbs = n / entries_per_limb; - self.limb_index += skip_limbs; - self.counter -= skip_limbs * entries_per_limb; - n -= skip_limbs * entries_per_limb; - - if n > 0 { - self.entries_left = entries_per_limb - n; - self.limb_index += 1; - self.cur_limb = self.limbs[self.limb_index] >> (n * self.bit_length); - self.counter -= n; - } - } -} - -impl<'a> Iterator for FpVectorIterator<'a> { - type Item = u32; - fn next(&mut self) -> Option { - if self.counter == 0 { - return None; - } else if self.entries_left == 0 { - self.limb_index += 1; - self.cur_limb = self.limbs[self.limb_index]; - self.entries_left = self.entries_per_limb_m_1; - } else { - self.entries_left -= 1; - } - - let result = (self.cur_limb & self.bit_mask) as u32; - self.counter -= 1; - self.cur_limb >>= self.bit_length; - - Some(result) - } -} - -impl<'a> ExactSizeIterator for FpVectorIterator<'a> { - fn len(&self) -> usize { - self.counter - } -} - -/// Iterator over non-zero entries of an FpVector. This is monomorphized over P for significant -/// performance gains. -pub struct FpVectorNonZeroIteratorP<'a, const P: u32> { - limbs: &'a [Limb], - limb_index: usize, - cur_limb_entries_left: usize, - cur_limb: Limb, - idx: usize, - dim: usize, -} - -impl<'a, const P: u32> FpVectorNonZeroIteratorP<'a, P> { - fn new(vec: SliceP<'a, P>) -> Self { - let entries_per_limb = limb::entries_per_limb_const::

(); - - let dim = vec.len(); - let limbs = vec.limbs; - - if dim == 0 { - return Self { - limbs, - limb_index: 0, - cur_limb_entries_left: 0, - cur_limb: 0, - idx: 0, - dim: 0, - }; - } - let min_index = vec.start; - let pair = limb::limb_bit_index_pair::

(min_index); - let cur_limb = limbs[pair.limb] >> pair.bit_index; - let cur_limb_entries_left = entries_per_limb - (min_index % entries_per_limb); - Self { - limbs, - limb_index: pair.limb, - cur_limb_entries_left, - cur_limb, - idx: 0, - dim, - } - } -} - -impl<'a, const P: u32> Iterator for FpVectorNonZeroIteratorP<'a, P> { - type Item = (usize, u32); - fn next(&mut self) -> Option { - let bit_length: usize = limb::bit_length_const::

(); - let bitmask: Limb = limb::bitmask::

(); - let entries_per_limb: usize = limb::entries_per_limb_const::

(); - loop { - let bits_left = (self.cur_limb_entries_left * bit_length) as u32; - #[allow(clippy::unnecessary_cast)] - let tz_real = (self.cur_limb | (1 as Limb).checked_shl(bits_left as u32).unwrap_or(0)) - .trailing_zeros(); - let tz_rem = ((tz_real as u8) % (bit_length as u8)) as u32; - let tz_div = ((tz_real as u8) / (bit_length as u8)) as u32; - let tz = tz_real - tz_rem; - self.idx += tz_div as usize; - if self.idx >= self.dim { - return None; - } - self.cur_limb_entries_left -= tz_div as usize; - if self.cur_limb_entries_left == 0 { - self.limb_index += 1; - self.cur_limb_entries_left = entries_per_limb; - self.cur_limb = self.limbs[self.limb_index]; - continue; - } - self.cur_limb >>= tz; - if tz == 0 { - break; - } - } - let result = (self.idx, (self.cur_limb & bitmask) as u32); - self.idx += 1; - self.cur_limb_entries_left -= 1; - self.cur_limb >>= bit_length; - Some(result) - } -} diff --git a/ext/crates/once/src/lib.rs b/ext/crates/once/src/lib.rs index 80280aa4be..f808ca7254 100644 --- a/ext/crates/once/src/lib.rs +++ b/ext/crates/once/src/lib.rs @@ -19,10 +19,10 @@ const USIZE_LEN: u32 = 0usize.count_zeros(); /// MAX_OUTER_LENGTH is relatively small, so we picked an arbitrary number. const MAX_OUTER_LENGTH: usize = 32; -/// This is a wrapper around our out-of-order push tracker. We put it in a box to make it smaller -/// in size. See [`OnceVec`] documentation for more details. +/// This is a wrapper around our out-of-order push tracker. See [`OnceVec`] documentation for +/// more details. 
#[derive(Clone, Default)] -pub struct OooTracker(Box>); +pub struct OooTracker(BTreeSet); const fn inner_index(index: usize) -> (usize, usize) { let page = (USIZE_LEN - 1 - (index + 1).leading_zeros()) as usize; diff --git a/ext/crates/sseq/src/differential.rs b/ext/crates/sseq/src/differential.rs index 380083f5b9..79b78fc9b2 100644 --- a/ext/crates/sseq/src/differential.rs +++ b/ext/crates/sseq/src/differential.rs @@ -1,7 +1,7 @@ use fp::{ matrix::{Matrix, Subspace}, prime::ValidPrime, - vector::{FpVector, Slice, SliceMut}, + vector::{prelude::*, FpVector, Slice, SliceMut}, }; pub struct Differential { @@ -74,8 +74,8 @@ impl Differential { .filter(|d| !d.is_zero()) .map(move |d| { ( - d.slice(0, source_dim).to_owned(), - d.slice(source_dim, source_dim + target_dim).to_owned(), + d.slice(0, source_dim).into_owned(), + d.slice(source_dim, source_dim + target_dim).into_owned(), ) }) .collect() diff --git a/ext/crates/sseq/src/sseq.rs b/ext/crates/sseq/src/sseq.rs index a768ca6c10..020c79b98c 100644 --- a/ext/crates/sseq/src/sseq.rs +++ b/ext/crates/sseq/src/sseq.rs @@ -4,7 +4,7 @@ use bivec::BiVec; use fp::{ matrix::{Matrix, Subquotient, Subspace}, prime::ValidPrime, - vector::{FpVector, Slice}, + vector::{prelude::*, FpVector, Slice}, }; use std::{marker::PhantomData, sync::Arc}; diff --git a/ext/examples/bruner.rs b/ext/examples/bruner.rs index 82bc54be30..7f6b2c5247 100644 --- a/ext/examples/bruner.rs +++ b/ext/examples/bruner.rs @@ -26,7 +26,11 @@ use ext::{ chain_complex::{ChainComplex, FiniteChainComplex as FCC}, resolution_homomorphism::ResolutionHomomorphism, }; -use fp::{matrix::Matrix, prime::TWO, vector::FpVector}; +use fp::{ + matrix::Matrix, + prime::TWO, + vector::{prelude::*, FpVector}, +}; use std::{ fs::File, io::{BufRead, BufReader}, diff --git a/ext/examples/define_module.rs b/ext/examples/define_module.rs index 2fc2f2a60e..b952578bec 100644 --- a/ext/examples/define_module.rs +++ b/ext/examples/define_module.rs @@ -10,7 +10,7 @@ use 
algebra::steenrod_evaluator::SteenrodEvaluator; use algebra::{AdemAlgebra, Algebra, GeneratedAlgebra}; use bivec::BiVec; use fp::prime::ValidPrime; -use fp::vector::FpVector; +use fp::vector::{prelude::*, FpVector}; use anyhow::anyhow; diff --git a/ext/examples/lift_hom.rs b/ext/examples/lift_hom.rs index 2c608dd17f..945c9638b6 100644 --- a/ext/examples/lift_hom.rs +++ b/ext/examples/lift_hom.rs @@ -45,6 +45,7 @@ use ext::chain_complex::{AugmentedChainComplex, ChainComplex, FreeChainComplex}; use ext::resolution_homomorphism::ResolutionHomomorphism; use ext::utils; use fp::matrix::Matrix; +use fp::vector::prelude::*; use std::path::PathBuf; use std::sync::Arc; diff --git a/ext/examples/massey.rs b/ext/examples/massey.rs index df5aca680c..288bcd06d1 100644 --- a/ext/examples/massey.rs +++ b/ext/examples/massey.rs @@ -6,6 +6,7 @@ use ext::chain_complex::{ChainComplex, ChainHomotopy, FreeChainComplex}; use ext::resolution_homomorphism::ResolutionHomomorphism; use fp::matrix::{AugmentedMatrix, Matrix}; +use fp::vector::prelude::*; use std::sync::Arc; fn main() -> anyhow::Result<()> { diff --git a/ext/examples/save_bruner.rs b/ext/examples/save_bruner.rs index 722b83abe7..bfe3a16a48 100644 --- a/ext/examples/save_bruner.rs +++ b/ext/examples/save_bruner.rs @@ -4,6 +4,7 @@ use algebra::module::Module; use algebra::{Algebra, AlgebraType, MilnorAlgebra}; use ext::{chain_complex::ChainComplex, utils::query_module}; +use fp::vector::prelude::*; use itertools::Itertools; use std::fmt::Write as _; use std::fs::File; diff --git a/ext/examples/secondary_massey.rs b/ext/examples/secondary_massey.rs index 28633ad032..cba93906dd 100644 --- a/ext/examples/secondary_massey.rs +++ b/ext/examples/secondary_massey.rs @@ -22,7 +22,7 @@ use std::sync::Arc; use algebra::module::Module; use algebra::pair_algebra::PairAlgebra; use fp::matrix::{Matrix, Subspace}; -use fp::vector::FpVector; +use fp::vector::{prelude::*, FpVector}; use ext::chain_complex::{ChainComplex, ChainHomotopy, 
FreeChainComplex}; use ext::resolution_homomorphism::ResolutionHomomorphism; diff --git a/ext/examples/secondary_product.rs b/ext/examples/secondary_product.rs index 4a9946a119..e25d13b175 100644 --- a/ext/examples/secondary_product.rs +++ b/ext/examples/secondary_product.rs @@ -24,7 +24,7 @@ use std::sync::Arc; use algebra::module::Module; use fp::matrix::Matrix; -use fp::vector::FpVector; +use fp::vector::{prelude::*, FpVector}; use ext::chain_complex::{ChainComplex, FreeChainComplex}; use ext::resolution_homomorphism::ResolutionHomomorphism; diff --git a/ext/examples/sq0.rs b/ext/examples/sq0.rs index 413b5ce9dd..fbc5aa0a44 100644 --- a/ext/examples/sq0.rs +++ b/ext/examples/sq0.rs @@ -7,7 +7,7 @@ use ext::chain_complex::{ }; use ext::resolution_homomorphism::ResolutionHomomorphism; use ext::utils; -use fp::vector::FpVector; +use fp::vector::{prelude::*, FpVector}; use itertools::Itertools; fn main() -> anyhow::Result<()> { diff --git a/ext/examples/steenrod.rs b/ext/examples/steenrod.rs index eb0367079f..88a00e1666 100644 --- a/ext/examples/steenrod.rs +++ b/ext/examples/steenrod.rs @@ -6,7 +6,7 @@ use ext::chain_complex::{ use ext::utils; use ext::yoneda::yoneda_representative_element; use fp::matrix::Matrix; -use fp::vector::FpVector; +use fp::vector::{prelude::*, FpVector}; use itertools::Itertools; use tensor_product_chain_complex::TensorChainComplex; @@ -284,7 +284,7 @@ mod sum_module { use algebra::module::block_structure::{BlockStructure, GeneratorBasisEltPair}; use algebra::module::{Module, ZeroModule}; - use fp::vector::SliceMut; + use fp::vector::{prelude::*, SliceMut}; use std::sync::Arc; @@ -475,7 +475,7 @@ mod tensor_product_chain_complex { use algebra::{Algebra, Bialgebra}; use ext::chain_complex::ChainComplex; use fp::matrix::AugmentedMatrix; - use fp::vector::{FpVector, Slice, SliceMut}; + use fp::vector::{prelude::*, FpVector, Slice, SliceMut}; use std::sync::Arc; use once::{OnceBiVec, OnceVec}; diff --git 
a/ext/src/chain_complex/chain_homotopy.rs b/ext/src/chain_complex/chain_homotopy.rs index 70c48b3189..3daa24fffe 100644 --- a/ext/src/chain_complex/chain_homotopy.rs +++ b/ext/src/chain_complex/chain_homotopy.rs @@ -4,7 +4,7 @@ use crate::save::SaveKind; use algebra::module::homomorphism::{FreeModuleHomomorphism, ModuleHomomorphism}; use algebra::module::Module; use fp::prime::ValidPrime; -use fp::vector::FpVector; +use fp::vector::{prelude::*, FpVector}; use once::OnceBiVec; use std::path::{Path, PathBuf}; diff --git a/ext/src/chain_complex/mod.rs b/ext/src/chain_complex/mod.rs index c35651b7d2..f76484c434 100644 --- a/ext/src/chain_complex/mod.rs +++ b/ext/src/chain_complex/mod.rs @@ -8,7 +8,7 @@ use algebra::{Algebra, MuAlgebra}; use bivec::BiVec; use fp::matrix::Matrix; use fp::prime::ValidPrime; -use fp::vector::{Slice, SliceMut}; +use fp::vector::{prelude::*, Slice, SliceMut}; use std::sync::Arc; use itertools::Itertools; diff --git a/ext/src/nassau.rs b/ext/src/nassau.rs index 9090c08242..e68dae45ce 100644 --- a/ext/src/nassau.rs +++ b/ext/src/nassau.rs @@ -33,7 +33,7 @@ use anyhow::anyhow; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use fp::matrix::{AugmentedMatrix, Matrix}; use fp::prime::{ValidPrime, TWO}; -use fp::vector::{FpVector, Slice, SliceMut}; +use fp::vector::{prelude::*, FpVector, Slice, SliceMut}; use itertools::Itertools; use once::OnceVec; @@ -163,8 +163,7 @@ impl MilnorSubalgebra { scratch.set_to_zero(); hom.apply_to_basis_element(scratch.as_slice_mut(), 1, degree, masked_index); - row.as_slice_mut() - .add_masked(scratch.as_slice(), 1, &target_mask); + row.add_masked(&scratch, 1, &target_mask); } result } @@ -511,12 +510,12 @@ impl> Resolution { f.write_u64::(next_mask[col] as u64)?; let preimage = masked_matrix.row_segment(row as usize, 1, 1); scratch.set_scratch_vector_size(preimage.len()); - scratch.as_slice_mut().assign(preimage); + scratch.assign(preimage); scratch.to_bytes(f)?; 
scratch.set_scratch_vector_size(full_matrix.columns()); for (i, _) in preimage.iter_nonzero() { - scratch.as_slice_mut().add(full_matrix.row(i), 1); + scratch.add(full_matrix.row(i), 1); } scratch.to_bytes(f)?; } @@ -617,8 +616,7 @@ impl> Resolution { let mut dxs = vec![FpVector::new(p, next.dimension(t)); num_new_gens]; for ((x, x_masked), dx) in xs.iter_mut().zip_eq(&n[next_row..]).zip_eq(&mut dxs) { - x.as_slice_mut() - .add_unmasked(x_masked.as_slice(), 1, &target_mask); + x.add_unmasked(x_masked, 1, &target_mask); for (i, _) in x_masked.iter_nonzero() { dx.add(&full_matrix[i], 1); } @@ -1005,7 +1003,7 @@ impl> ChainComplex for Resolution { let target = &self.modules[s - 1]; let algebra = target.algebra(); - let mut inputs: Vec = inputs.iter().map(|x| x.into().to_owned()).collect(); + let mut inputs: Vec = inputs.iter().map(|x| x.into().into_owned()).collect(); let mut mask: Vec = Vec::with_capacity(zero_mask_dim + 8); mask.extend(subalgebra.signature_mask(&algebra, source, t, &subalgebra.zero_signature())); @@ -1048,7 +1046,7 @@ impl> ChainComplex for Resolution { matrix .row_segment_mut(i, 1, 1) .slice_mut(0, dx.len()) - .add(dx.as_slice(), 1); + .add(dx, 1); matrix .row_segment_mut(i, 2, 2) .add_basis_element(zero_mask_dim + i, 1); @@ -1095,7 +1093,7 @@ impl> ChainComplex for Resolution { output .into() .add_unmasked(dx_matrix.row_segment(i, 2, 2), 1, &mask); - input.as_slice_mut().add(dx_matrix.row_segment(i, 1, 1), 1); + input.add(dx_matrix.row_segment(i, 1, 1), 1); } } } @@ -1111,9 +1109,7 @@ impl> ChainComplex for Resolution { if entry != 0 { output.into().add_unmasked(scratch0.as_slice(), 1, &mask); // If we resume a resolve_through_stem, input may be longer than scratch1. 
- input - .slice_mut(0, scratch1.len()) - .add(scratch1.as_slice(), 1); + input.slice_mut(0, scratch1.len()).add(&scratch1, 1); } } @@ -1124,11 +1120,11 @@ impl> ChainComplex for Resolution { dx_matrix .row_segment_mut(i, 2, 2) .slice_mut(0, zero_mask_dim) - .add(scratch0.as_slice(), 1); + .add(&scratch0, 1); dx_matrix .row_segment_mut(i, 1, 1) .slice_mut(0, target_dim) - .add(scratch1.as_slice(), 1); + .add(&scratch1, 1); } } } diff --git a/ext/src/resolution.rs b/ext/src/resolution.rs index 8d27889819..6d26278a53 100644 --- a/ext/src/resolution.rs +++ b/ext/src/resolution.rs @@ -10,7 +10,7 @@ use algebra::module::homomorphism::{ModuleHomomorphism, MuFreeModuleHomomorphism use algebra::module::{Module, MuFreeModule}; use algebra::{Algebra, MuAlgebra}; use fp::matrix::{AugmentedMatrix, QuasiInverse, Subspace}; -use fp::vector::{FpVector, Slice, SliceMut}; +use fp::vector::{prelude::*, FpVector, Slice, SliceMut}; use once::OnceVec; use std::path::{Path, PathBuf}; diff --git a/ext/src/resolution_homomorphism.rs b/ext/src/resolution_homomorphism.rs index 289857d020..c870cdd0cd 100644 --- a/ext/src/resolution_homomorphism.rs +++ b/ext/src/resolution_homomorphism.rs @@ -12,7 +12,7 @@ use algebra::module::homomorphism::{ModuleHomomorphism, MuFreeModuleHomomorphism use algebra::module::Module; use algebra::MuAlgebra; use fp::matrix::Matrix; -use fp::vector::{FpVector, SliceMut}; +use fp::vector::{prelude::*, FpVector, SliceMut}; use once::OnceBiVec; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; @@ -494,7 +494,7 @@ where let source_t = t + self.shift_t; assert_eq!( - result.as_slice().len(), + result.len(), self.source .module(source_s) .number_of_gens_in_degree(source_t) @@ -504,7 +504,7 @@ where let map = self.get_map(source_s); let j = target_module.operation_generator_to_index(0, 0, t, idx); - for i in 0..result.as_slice().len() { + for i in 0..result.len() { result.add_basis_element(i, coef * map.output(source_t, i).entry(j)); } } diff --git 
a/ext/src/secondary.rs b/ext/src/secondary.rs index d45da1e18c..a97c83591e 100644 --- a/ext/src/secondary.rs +++ b/ext/src/secondary.rs @@ -10,7 +10,7 @@ use algebra::Algebra; use bivec::BiVec; use fp::matrix::Matrix; use fp::prime::ValidPrime; -use fp::vector::{FpVector, Slice, SliceMut}; +use fp::vector::{prelude::*, FpVector, Slice, SliceMut}; use once::OnceBiVec; use std::io::{Read, Write}; @@ -1087,7 +1087,7 @@ where let extra = *v / *p; out.slice_mut(source_num_gens, source_num_gens + tau_num_gens) - .add(mp[i].as_slice(), (extra * filtration_one_sign) % *p); + .add(&mp[i], (extra * filtration_one_sign) % *p); } if let Some(page_data) = page_data { page_data.reduce_by_quotient( diff --git a/ext/src/utils.rs b/ext/src/utils.rs index c8a407ec4c..914a009c02 100644 --- a/ext/src/utils.rs +++ b/ext/src/utils.rs @@ -9,6 +9,7 @@ use algebra::module::{steenrod_module, FDModule, Module, SteenrodModule}; use algebra::{AlgebraType, MilnorAlgebra, SteenrodAlgebra}; use anyhow::{anyhow, Context}; +use fp::vector::prelude::*; use serde_json::Value; use std::convert::{TryFrom, TryInto}; diff --git a/ext/src/yoneda.rs b/ext/src/yoneda.rs index 26efd3d469..dc45a72fa9 100644 --- a/ext/src/yoneda.rs +++ b/ext/src/yoneda.rs @@ -12,7 +12,7 @@ use algebra::module::{FDModule, FreeModule, Module}; use algebra::{AdemAlgebra, Algebra, GeneratedAlgebra, MilnorAlgebra, SteenrodAlgebra}; use fp::matrix::{AugmentedMatrix, Matrix, Subspace}; -use fp::vector::FpVector; +use fp::vector::{prelude::*, FpVector}; use bivec::BiVec; diff --git a/ext/tests/extend_identity.rs b/ext/tests/extend_identity.rs index 7fe986ff7a..6e787d0542 100644 --- a/ext/tests/extend_identity.rs +++ b/ext/tests/extend_identity.rs @@ -3,7 +3,7 @@ use algebra::module::Module; use ext::chain_complex::{AugmentedChainComplex, ChainComplex}; use ext::resolution_homomorphism::ResolutionHomomorphism; use ext::utils::{construct, Config}; -use fp::vector::FpVector; +use fp::vector::{prelude::*, FpVector}; use serde_json::json; 
use std::convert::TryInto; use std::sync::Arc; diff --git a/web_ext/sseq_gui/src/actions.rs b/web_ext/sseq_gui/src/actions.rs index 27451d703b..a71a43e356 100644 --- a/web_ext/sseq_gui/src/actions.rs +++ b/web_ext/sseq_gui/src/actions.rs @@ -4,7 +4,7 @@ use algebra::module::Module; use bivec::BiVec; use enum_dispatch::enum_dispatch; use ext::{chain_complex::FreeChainComplex, CCC}; -use fp::vector::FpVector; +use fp::vector::{prelude::*, FpVector}; use itertools::Itertools; use serde::{Deserialize, Serialize}; diff --git a/web_ext/sseq_gui/src/resolution_wrapper.rs b/web_ext/sseq_gui/src/resolution_wrapper.rs index 98f3256db0..4d2aa68c0c 100644 --- a/web_ext/sseq_gui/src/resolution_wrapper.rs +++ b/web_ext/sseq_gui/src/resolution_wrapper.rs @@ -10,6 +10,7 @@ use ext::chain_complex::{AugmentedChainComplex, ChainComplex, FreeChainComplex}; use ext::resolution::Resolution as ResolutionInner; use fp::matrix::Matrix; use fp::prime::ValidPrime; +use fp::vector::prelude::*; use once::{OnceBiVec, OnceVec}; use ext::resolution_homomorphism::ResolutionHomomorphism as ResolutionHomomorphism_; diff --git a/web_ext/sseq_gui/src/sseq.rs b/web_ext/sseq_gui/src/sseq.rs index 6770247b00..bc35f59d50 100644 --- a/web_ext/sseq_gui/src/sseq.rs +++ b/web_ext/sseq_gui/src/sseq.rs @@ -5,7 +5,7 @@ use fp::prime::ValidPrime; use fp::vector::FpVector; use fp::{ matrix::{Matrix, Subquotient}, - vector::Slice, + vector::{prelude::*, Slice}, }; use serde::{Deserialize, Serialize}; use sseq::{Adams, Sseq, SseqProfile}; @@ -283,7 +283,7 @@ impl SseqWrapper

{ .inner .page_data(x, y) .iter() - .map(|x| x.gens().map(Slice::to_owned).collect()) + .map(|x| x.gens().map(Slice::into_owned).collect()) .collect::>>(), }), }); diff --git a/web_ext/steenrod_calculator/Cargo.toml b/web_ext/steenrod_calculator/Cargo.toml index 8bc71d42df..b0720bf162 100644 --- a/web_ext/steenrod_calculator/Cargo.toml +++ b/web_ext/steenrod_calculator/Cargo.toml @@ -6,6 +6,7 @@ edition = "2021" [dependencies] algebra = { path = "../../ext/crates/algebra", default-features = false, features = ["odd-primes"] } +fp = { path = "../../ext/crates/fp", default-features = false, features = ["odd-primes"] } wasm-bindgen = "=0.2.78" [lib] diff --git a/web_ext/steenrod_calculator/src/lib.rs b/web_ext/steenrod_calculator/src/lib.rs index 4f06f2fc28..8201923911 100644 --- a/web_ext/steenrod_calculator/src/lib.rs +++ b/web_ext/steenrod_calculator/src/lib.rs @@ -1,5 +1,6 @@ use algebra::steenrod_evaluator::SteenrodEvaluator; use algebra::Algebra; +use fp::vector::prelude::*; use wasm_bindgen::prelude::*; #[wasm_bindgen]