Huici, Daniel; Rodríguez, Ricardo J.; Mena, Eduardo
APOTHEOSIS: An efficient approximate similarity search system Journal Article
In: SoftwareX, vol. 29, pp. 102016, 2025, ISSN: 2352-7110.
Abstract | Links | BibTeX | Tags: Approximate K-nearest neighbors, Approximate matching, Approximate search methods, Data similarity analysis, similarity digest algorithms
% SoftwareX journal article on the APOTHEOSIS tool (volume 29, article number 102016).
% The acronym in the title is brace-protected so sentence-casing styles keep it uppercase.
% Both year and date are kept: classic styles read year, date-aware styles read date.
% tppubtype is not a standard field (presumably publication-manager metadata -- confirm
% with whatever tool exports this file); styles ignore fields they do not know.
@article{HuiciRM-SoftX-25,
  author     = {Huici, Daniel and Rodríguez, Ricardo J. and Mena, Eduardo},
  title      = {{APOTHEOSIS}: An efficient approximate similarity search system},
  journal    = {SoftwareX},
  volume     = {29},
  pages      = {102016},
  year       = {2025},
  date       = {2025-02-01},
  issn       = {2352-7110},
  doi        = {10.1016/j.softx.2024.102016},
  url        = {https://webdiis.unizar.es/~ricardo/files/papers/HuiciRM-SoftX-25.pdf},
  urldate    = {2025-02-01},
  abstract   = {APOTHEOSIS is a tool for efficiently identifying and comparing data similarity in large datasets, addressing challenges faced by traditional methods such as scalability and speed. APOTHEOSIS overcomes them by combining advanced algorithms and data structures, enabling fast and accurate similarity analysis. Specifically, it uses a custom hierarchical navigable small world as an approximate $K$-nearest neighbors search method, and approximate similarity digest algorithms to find common features between similar data items, also supporting various distance metrics beyond vector-based approaches. Our software tool is designed for seamless integration into research workflows, improving reproducibility and facilitating the comparison of large-scale, high-dimensional data across multiple domains.},
  keywords   = {Approximate K-nearest neighbors, Approximate matching, Approximate search methods, Data similarity analysis, similarity digest algorithms},
  pubstate   = {published},
  tppubtype  = {article}
}
Huici, Daniel; Rodríguez, Ricardo J.; Mena, Eduardo
An Extensible and Scalable System for Hash Lookup and Approximate Similarity Search with Similarity Digest Algorithms Journal Article
In: Forensic Science International: Digital Investigation, vol. 53, pp. 301930, 2025, ISSN: 2666-2817, (DFRWS USA 2025 - Selected Papers from the 25th Annual Digital Forensics Research Conference USA).
Abstract | Links | BibTeX | Tags: Approximate matching, hash lookup, similarity digest algorithms, Similarity hashing, similarity search
% Journal article in Forensic Science International: Digital Investigation
% (volume 53, article number 301930); the note field records the DFRWS USA 2025
% selected-papers issue it belongs to.
% The system name in the abstract is typeset with \texttt, consistent with the
% LaTeX markup already used in abstracts in this file.
% tppubtype is not a standard field (presumably publication-manager metadata -- confirm
% with whatever tool exports this file); styles ignore fields they do not know.
@article{HuiciRM-FSIDI-25,
  author     = {Huici, Daniel and Rodríguez, Ricardo J. and Mena, Eduardo},
  title      = {An Extensible and Scalable System for Hash Lookup and Approximate Similarity Search with Similarity Digest Algorithms},
  journal    = {Forensic Science International: Digital Investigation},
  volume     = {53},
  pages      = {301930},
  year       = {2025},
  date       = {2025-07-01},
  issn       = {2666-2817},
  doi        = {10.1016/j.fsidi.2025.301930},
  url        = {https://webdiis.unizar.es/~ricardo/files/papers/HuiciRM-FSIDI-25.pdf},
  note       = {DFRWS USA 2025 - Selected Papers from the 25th Annual Digital Forensics Research Conference USA},
  abstract   = {Efficient management and analysis of large volumes of digital data has emerged as a major challenge in the field of digital forensics. To quickly identify and analyze relevant artifacts within large datasets, we introduce \texttt{Apotheosis}, an approximate similarity search system designed for scalability and efficiency. Our system integrates approximate search techniques (which allow searching for a match on a close value) with Similarity Digest Algorithms (SDA; which capture common features between similar elements), using a space-saving radix tree and a graph-based hierarchical navigable small world structure to perform fast approximate nearest neighbor searches. We demonstrate the effectiveness and versatility of our system through two key case studies: first, in plagiarism detection, demonstrating the effectiveness of our system in identifying similar or duplicate documents within a large source code dataset; then, in memory artifact detection, showing its scalability and performance in processing large-scale forensic data collected from various versions of Microsoft Windows. Our comprehensive evaluation shows that \texttt{Apotheosis} not only efficiently handles large datasets, but also provides a way to evaluate the performance of various SDA and their approximate similarity search in different forensic scenarios.},
  keywords   = {Approximate matching, hash lookup, similarity digest algorithms, Similarity hashing, similarity search},
  pubstate   = {published},
  tppubtype  = {article}
}
Martín-Pérez, Miguel; Rodríguez, Ricardo J.; Balzarotti, Davide
Pre-processing Memory Dumps to Improve Similarity Score of Windows Modules Journal Article
In: Computers & Security, vol. 101, pp. 102119, 2021, ISSN: 0167-4048.
Abstract | Links | BibTeX | Tags: memory forensics, relocation, similarity digest algorithms, Windows
% Computers and Security journal article (volume 101, article number 102119).
% The ampersand in the journal name is escaped because a bare ampersand is a LaTeX
% alignment character and would abort typesetting of the bibliography.
% The proper noun in the title is brace-protected so sentence-casing styles keep its capital.
% tppubtype is not a standard field (presumably publication-manager metadata -- confirm
% with whatever tool exports this file); styles ignore fields they do not know.
@article{MRB-COSE-21,
  author     = {Martín-Pérez, Miguel and Rodríguez, Ricardo J. and Balzarotti, Davide},
  title      = {Pre-processing Memory Dumps to Improve Similarity Score of {Windows} Modules},
  journal    = {Computers \& Security},
  volume     = {101},
  pages      = {102119},
  year       = {2021},
  date       = {2021-01-01},
  issn       = {0167-4048},
  doi        = {10.1016/j.cose.2020.102119},
  url        = {http://webdiis.unizar.es/~ricardo/files/papers/MRB-COSE-21.pdf},
  abstract   = {Memory forensics is useful to provide a fast triage on running processes at the time of memory acquisition in order to avoid unnecessary forensic analysis. However, due to the effects of the execution of the process itself, traditional cryptographic hashes, normally used in disk forensics to identify files, are unsuitable in memory forensics. Similarity digest algorithms allow an analyst to compute a similarity score of inputs that can be slightly different. In this paper, we focus on the issues caused by relocation of Windows processes and system libraries when computing similarities between them. To overcome these issues, we introduce two methods (Guided De-relocation and Linear Sweep De-relocation) to pre-process a memory dump. The goal of both methods is to identify and undo the effect of relocation in every module contained in the dump, providing sanitized inputs to similarity digest algorithms that improve similarity scores between modules. Guided De-relocation relies on specific structures of the Windows PE format, while Linear Sweep De-relocation relies on a disassembling process to identify assembly instructions having memory operands that address to the memory range of the module. We have integrated both methods in a Volatility plugin and evaluated them in different scenarios. Our results demonstrate that pre-processing memory dumps with these methods significantly improves similarity scores between memory modules.},
  keywords   = {memory forensics, relocation, similarity digest algorithms, Windows},
  pubstate   = {published},
  tppubtype  = {article}
}