Huici, Daniel; Rodríguez, Ricardo J.; Mena, Eduardo
APOTHEOSIS: An efficient approximate similarity search system Journal Article
In: SoftwareX, vol. 29, pp. 102016, 2025, ISSN: 2352-7110.
Abstract | Links | BibTeX | Tags: Approximate K-nearest neighbors, Approximate matching, Approximate search methods, Data similarity analysis, similarity digest algorithms
% Journal article in SoftwareX, vol. 29, article no. 102016 (2025).
% The acronym in the title is brace-protected so that sentence-casing
% bibliography styles keep it in upper case.
@article{HuiciRM-SoftX-25,
  title      = {{APOTHEOSIS}: An efficient approximate similarity search system},
  author     = {Daniel Huici and Ricardo J. Rodríguez and Eduardo Mena},
  url        = {https://webdiis.unizar.es/~ricardo/files/papers/HuiciRM-SoftX-25.pdf},
  doi        = {10.1016/j.softx.2024.102016},
  issn       = {2352-7110},
  year       = {2025},
  date       = {2025-02-01},
  urldate    = {2025-02-01},
  journal    = {SoftwareX},
  volume     = {29},
  pages      = {102016},
  abstract   = {APOTHEOSIS is a tool for efficiently identifying and comparing data similarity in large datasets, addressing challenges faced by traditional methods such as scalability and speed. APOTHEOSIS overcomes them by combining advanced algorithms and data structures, enabling fast and accurate similarity analysis. Specifically, it uses a custom hierarchical small navigation world as an approximate $K$-nearest neighbors search method, and approximate similarity digests algorithms to find common features between similar data items, also supporting various distance metrics beyond vector-based approaches. Our software tool is designed for seamless integration into research workflows, improving reproducibility and facilitating the comparison of large-scale, high-dimensional data comparison across multiple domains.},
  keywords   = {Approximate K-nearest neighbors, Approximate matching, Approximate search methods, Data similarity analysis, similarity digest algorithms},
  pubstate   = {published},
  tppubtype  = {article}
}
Huici, Daniel; Rodríguez, Ricardo J.; Mena, Eduardo
An Extensible and Scalable System for Hash Lookup and Approximate Similarity Search with Similarity Digest Algorithms Journal Article
In: Forensic Science International: Digital Investigation, vol. 53, pp. 301930, 2025, ISSN: 2666-2817, (DFRWS USA 2025 - Selected Papers from the 25th Annual Digital Forensics Research Conference USA).
Abstract | Links | BibTeX | Tags: Approximate matching, hash lookup, similarity digest algorithms, Similarity hashing, similarity search
% Journal article in Forensic Science International: Digital Investigation,
% vol. 53, article no. 301930 (2025); part of the DFRWS USA 2025 selected papers.
% The abstract previously contained the residue "tt Apotheosis" (a typewriter-font
% command that lost its markup); the system name is now given as plain text.
@article{HuiciRM-FSIDI-25,
  title      = {An Extensible and Scalable System for Hash Lookup and Approximate Similarity Search with Similarity Digest Algorithms},
  author     = {Daniel Huici and Ricardo J. Rodríguez and Eduardo Mena},
  url        = {https://webdiis.unizar.es/~ricardo/files/papers/HuiciRM-FSIDI-25.pdf},
  doi        = {10.1016/j.fsidi.2025.301930},
  issn       = {2666-2817},
  year       = {2025},
  date       = {2025-07-01},
  journal    = {Forensic Science International: Digital Investigation},
  volume     = {53},
  pages      = {301930},
  abstract   = {Efficient management and analysis of large volumes of digital data has emerged as a major challenge in the field of digital forensics. To quickly identify and analyze relevant artifacts within large datasets, we introduce Apotheosis, an approximate similarity search system designed for scalability and efficiency. Our system integrates approximate search techniques (which allow searching for a match on a close value) with Similarity Digest Algorithms (SDA; which capture common features between similar elements), using a space-saving radix tree and a graph-based hierarchical navigable small world structure to perform fast approximate nearest neighbor searches. We demonstrate the effectiveness and versatility of our system through two key case studies: first, in plagiarism detection, demonstrating the effectiveness of our system in identifying similar or duplicate documents within a large source code dataset; then, in memory artifact detection, showing its scalability and performance in processing large-scale forensic data collected from various versions of Microsoft Windows. Our comprehensive evaluation shows that Apotheosis not only efficiently handles large datasets, but also provides a way to evaluate the performance of various SDA and their approximate similarity search in different forensic scenarios.},
  note       = {DFRWS USA 2025 - Selected Papers from the 25th Annual Digital Forensics Research Conference USA},
  keywords   = {Approximate matching, hash lookup, similarity digest algorithms, Similarity hashing, similarity search},
  pubstate   = {published},
  tppubtype  = {article}
}
Huici, Daniel; Rodríguez, Ricardo J.
A Dataset of Windows System Binaries and Similarity Digests for Enhanced Forensic Analysis Journal Article
In: Data in Brief, 2025, ISSN: 2352-3409, (Accepted for publication. To appear.).
Abstract | Links | BibTeX | Tags: Approximate matching, forensic artifacts, Malware Detection, operating system Windows, Similarity digest algorithm, Static Analysis, system binaries
% Journal article accepted by Data in Brief but not yet assigned to an issue.
% The volume, number and pages fields are omitted until the real values are known
% (they previously held the placeholder "PP"), and pubstate is "forthcoming" so it
% agrees with the note. NOTE(review): add volume/pages/doi and switch pubstate
% back to "published" once the article appears.
@article{HuiciR-DIB-25b,
  title      = {A Dataset of {Windows} System Binaries and Similarity Digests for Enhanced Forensic Analysis},
  author     = {Daniel Huici and Ricardo J. Rodríguez},
  url        = {https://webdiis.unizar.es/~ricardo/files/papers/HuiciR-DIB-25.pdf},
  issn       = {2352-3409},
  year       = {2025},
  date       = {2025-01-01},
  journal    = {Data in Brief},
  abstract   = {Similarity digest algorithms, such as TLSH, ssdeep, or sdhash, to name a few, generate intermediate representations (i.e., digests) of digital artifacts to efficiently identify similar objects and measure their degree of similarity. This dataset provides the results of a static analysis performed on system binary files extracted from multiple versions of the Windows operating system, accompanied by their similarity digests. An automated static analysis process was applied to all extracted binaries to decompose them into individual functions and capture detailed metadata for each of them. Specifically, similarity hashes (in particular, TLSH, ssdeep, and LZJD) were computed to enable forensic analysts to effectively assess artifact similarities. The dataset serves as an “allow list” of legitimate Windows artifacts, allowing forensic analysts to detect deviations from trusted binaries, verify system integrity, perform software audits, and improve malware detection efforts. This paper describes the structure of the dataset, the methodology and tools used in its creation, and its value for forensic analysis and cybersecurity investigation.},
  note       = {Accepted for publication. To appear.},
  keywords   = {Approximate matching, forensic artifacts, Malware Detection, operating system Windows, Similarity digest algorithm, Static Analysis, system binaries},
  pubstate   = {forthcoming},
  tppubtype  = {article}
}
Martín-Pérez, Miguel; Rodríguez, Ricardo J.; Breitinger, Frank
Bringing Order to Approximate Matching: Classification and Attacks on Similarity Digest Algorithms Journal Article
In: Forensic Science International: Digital Investigation, vol. 36, pp. 301120, 2021, ISSN: 2666-2817.
Abstract | Links | BibTeX | Tags: Approximate matching, Bytewise, Classification scheme, Fuzzy hashing, Similarity digest algorithm, Similarity hashing
% Journal article in Forensic Science International: Digital Investigation,
% vol. 36, article no. 301120 (2021).
% The middle initial carries its period and the PDF link uses https, matching
% the other entries by the same author and on the same host.
@article{MRB-FSIDI-21,
  title      = {Bringing Order to Approximate Matching: Classification and Attacks on Similarity Digest Algorithms},
  author     = {Miguel Martín-Pérez and Ricardo J. Rodríguez and Frank Breitinger},
  url        = {https://webdiis.unizar.es/~ricardo/files/papers/MRB-FSIDI-21.pdf},
  doi        = {10.1016/j.fsidi.2021.301120},
  issn       = {2666-2817},
  year       = {2021},
  date       = {2021-01-01},
  journal    = {Forensic Science International: Digital Investigation},
  volume     = {36},
  pages      = {301120},
  abstract   = {Bytewise approximate matching algorithms (a.k.a.~fuzzy hashing or similarity hashing) convert digital artifacts into an intermediate representation to allow a faster comparison them. They gained a lot of popularity over the past decade with new algorithms being developed and released to the digital forensics community. When releasing algorithms (e.g., as part of a scientific article), they are frequently compared with other algorithms to outline the benefits and sometimes also the weaknesses of the proposed approach. However, given the wide variety of algorithms and approaches, it is impossible to provide direct comparisons with all existing algorithms.
In this paper, we present the first classification of approximate matching algorithms which allows an easier description and comparisons.
Therefore, we first reviewed existing literature to understand the techniques various algorithms use and to familiarize ourselves with the common terminology. Our findings allowed us to develop a categorization relying heavily on the terminology proposed by NIST SP 800-168. In addition to the categorization, this article also presents an abstract set of attacks against algorithms and why they are feasible. Lastly, we detail the characteristics needed to build robust algorithms to prevent attacks. We believe that this article helps newcomers, practitioners, and experts alike to better compare algorithms, understand their potential, as well as characteristics and implications they may have on forensic investigations.},
  keywords   = {Approximate matching, Bytewise, Classification scheme, Fuzzy hashing, Similarity digest algorithm, Similarity hashing},
  pubstate   = {published},
  tppubtype  = {article}
}
In this paper, we present the first classification of approximate matching algorithms which allows an easier description and comparisons.
Therefore, we first reviewed existing literature to understand the techniques various algorithms use and to familiarize ourselves with the common terminology. Our findings allowed us to develop a categorization relying heavily on the terminology proposed by NIST SP 800-168. In addition to the categorization, this article also presents an abstract set of attacks against algorithms and why they are feasible. Lastly, we detail the characteristics needed to build robust algorithms to prevent attacks. We believe that this article helps newcomers, practitioners, and experts alike to better compare algorithms, understand their potential, as well as characteristics and implications they may have on forensic investigations.