prefetch-npm-deps: fix reproducibility

v1 lockfiles can contain multiple references to the same version of a
package, and these references can contain different `integrity` values,
such as one having SHA-1 and SHA-512, while another just has SHA-512.

Given that HashMap iteration order isn't defined, this causes
reproducibility issues, as a different integrity value could be chosen
each time.

Thanks to @lilyinstarlight for discovering this issue originally, as well
as the idea for the sorting-based implementation.
This commit is contained in:
Winter 2023-01-28 12:48:24 -05:00 committed by Lily Foster
parent d6b863fd9b
commit 7efebca89c
No known key found for this signature in database
GPG key ID: 49340081E484C893
3 changed files with 151 additions and 47 deletions

View file

@ -105,7 +105,7 @@ fn main() -> anyhow::Result<()> {
eprintln!("{}", package.name);
let tarball = package.tarball()?;
let integrity = package.integrity();
let integrity = package.integrity().map(ToString::to_string);
cache
.put(

View file

@ -1,7 +1,14 @@
use anyhow::{bail, Context};
use anyhow::{anyhow, bail, Context};
use rayon::slice::ParallelSliceMut;
use serde::Deserialize;
use std::{collections::HashMap, fmt};
use serde::{
de::{self, Visitor},
Deserialize, Deserializer,
};
use std::{
cmp::Ordering,
collections::{HashMap, HashSet},
fmt,
};
use url::Url;
pub(super) fn packages(content: &str) -> anyhow::Result<Vec<Package>> {
@ -33,6 +40,13 @@ pub(super) fn packages(content: &str) -> anyhow::Result<Vec<Package>> {
x.resolved
.partial_cmp(&y.resolved)
.expect("resolved should be comparable")
.then(
// v1 lockfiles can contain multiple references to the same version of a package, with
// different integrity values (e.g. a SHA-1 and a SHA-512 in one, but just a SHA-512 in another)
y.integrity
.partial_cmp(&x.integrity)
.expect("integrity should be comparable"),
)
});
packages.dedup_by(|x, y| x.resolved == y.resolved);
@ -54,7 +68,7 @@ struct OldPackage {
#[serde(default)]
bundled: bool,
resolved: Option<UrlOrString>,
integrity: Option<String>,
integrity: Option<HashCollection>,
dependencies: Option<HashMap<String, OldPackage>>,
}
@ -63,7 +77,7 @@ pub(super) struct Package {
#[serde(default)]
pub(super) name: Option<String>,
pub(super) resolved: Option<UrlOrString>,
pub(super) integrity: Option<String>,
pub(super) integrity: Option<HashCollection>,
}
#[derive(Debug, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
@ -82,6 +96,102 @@ impl fmt::Display for UrlOrString {
}
}
#[derive(Debug, PartialEq, Eq)]
pub(super) struct HashCollection(HashSet<Hash>);
impl HashCollection {
pub(super) fn into_best(self) -> Option<Hash> {
self.0.into_iter().max()
}
}
impl PartialOrd for HashCollection {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
let lhs = self.0.iter().max()?;
let rhs = other.0.iter().max()?;
lhs.partial_cmp(rhs)
}
}
impl<'de> Deserialize<'de> for HashCollection {
fn deserialize<D>(deserializer: D) -> Result<HashCollection, D::Error>
where
D: Deserializer<'de>,
{
deserializer.deserialize_string(HashCollectionVisitor)
}
}
struct HashCollectionVisitor;
impl<'de> Visitor<'de> for HashCollectionVisitor {
type Value = HashCollection;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a single SRI hash or a collection of them (separated by spaces)")
}
fn visit_str<E>(self, value: &str) -> Result<HashCollection, E>
where
E: de::Error,
{
let hashes = value
.split_ascii_whitespace()
.map(Hash::new)
.collect::<anyhow::Result<_>>()
.map_err(E::custom)?;
Ok(HashCollection(hashes))
}
}
#[derive(Debug, Deserialize, PartialEq, Eq, Hash)]
pub struct Hash(String);
// Hash algorithms, in ascending preference.
const ALGOS: &[&str] = &["sha1", "sha512"];
impl Hash {
fn new(s: impl AsRef<str>) -> anyhow::Result<Hash> {
let algo = s
.as_ref()
.split_once('-')
.ok_or_else(|| anyhow!("expected SRI hash, got {:?}", s.as_ref()))?
.0;
if ALGOS.iter().any(|&a| algo == a) {
Ok(Hash(s.as_ref().to_string()))
} else {
Err(anyhow!("unknown hash algorithm {algo:?}"))
}
}
}
impl fmt::Display for Hash {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.0.fmt(f)
}
}
impl PartialOrd for Hash {
fn partial_cmp(&self, other: &Hash) -> Option<Ordering> {
let lhs = self.0.split_once('-')?.0;
let rhs = other.0.split_once('-')?.0;
ALGOS
.iter()
.position(|&s| lhs == s)?
.partial_cmp(&ALGOS.iter().position(|&s| rhs == s)?)
}
}
impl Ord for Hash {
fn cmp(&self, other: &Hash) -> Ordering {
self.partial_cmp(other).unwrap()
}
}
#[allow(clippy::case_sensitive_file_extension_comparisons)]
fn to_new_packages(
old_packages: HashMap<String, OldPackage>,
@ -149,8 +259,13 @@ fn get_initial_url() -> anyhow::Result<Url> {
#[cfg(test)]
mod tests {
use super::{get_initial_url, to_new_packages, OldPackage, Package, UrlOrString};
use std::collections::HashMap;
use super::{
get_initial_url, to_new_packages, Hash, HashCollection, OldPackage, Package, UrlOrString,
};
use std::{
cmp::Ordering,
collections::{HashMap, HashSet},
};
use url::Url;
#[test]
@ -188,4 +303,23 @@ mod tests {
Ok(())
}
#[test]
fn hash_preference() {
assert_eq!(
Hash(String::from("sha1-foo")).partial_cmp(&Hash(String::from("sha512-foo"))),
Some(Ordering::Less)
);
assert_eq!(
HashCollection({
let mut set = HashSet::new();
set.insert(Hash(String::from("sha512-foo")));
set.insert(Hash(String::from("sha1-bar")));
set
})
.into_best(),
Some(Hash(String::from("sha512-foo")))
);
}
}

View file

@ -87,7 +87,7 @@ pub struct Package {
#[derive(Debug)]
enum Specifics {
Registry { integrity: String },
Registry { integrity: lock::Hash },
Git { workdir: TempDir },
}
@ -134,11 +134,11 @@ impl Package {
Specifics::Git { workdir }
}
None => Specifics::Registry {
integrity: get_ideal_hash(
&pkg.integrity
.expect("non-git dependencies should have assosciated integrity"),
)?
.to_string(),
integrity: pkg
.integrity
.expect("non-git dependencies should have assosciated integrity")
.into_best()
.expect("non-git dependencies should have non-empty assosciated integrity"),
},
};
@ -181,9 +181,9 @@ impl Package {
}
}
pub fn integrity(&self) -> Option<String> {
pub fn integrity(&self) -> Option<&lock::Hash> {
match &self.specifics {
Specifics::Registry { integrity } => Some(integrity.clone()),
Specifics::Registry { integrity } => Some(integrity),
Specifics::Git { .. } => None,
}
}
@ -304,25 +304,9 @@ fn get_hosted_git_url(url: &Url) -> anyhow::Result<Option<Url>> {
}
}
fn get_ideal_hash(integrity: &str) -> anyhow::Result<&str> {
let split: Vec<_> = integrity.split_ascii_whitespace().collect();
if split.len() == 1 {
Ok(split[0])
} else {
for hash in ["sha512-", "sha1-"] {
if let Some(h) = split.iter().find(|s| s.starts_with(hash)) {
return Ok(h);
}
}
Err(anyhow!("not sure which hash to select out of {split:?}"))
}
}
#[cfg(test)]
mod tests {
use super::{get_hosted_git_url, get_ideal_hash};
use super::get_hosted_git_url;
use url::Url;
#[test]
@ -353,18 +337,4 @@ mod tests {
"GitLab URLs should be marked as invalid (lol)"
);
}
#[test]
fn ideal_hashes() {
for (input, expected) in [
("sha512-foo sha1-bar", Some("sha512-foo")),
("sha1-bar md5-foo", Some("sha1-bar")),
("sha1-bar", Some("sha1-bar")),
("sha512-foo", Some("sha512-foo")),
("foo-bar sha1-bar", Some("sha1-bar")),
("foo-bar baz-foo", None),
] {
assert_eq!(get_ideal_hash(input).ok(), expected);
}
}
}