Skip to content

Commit 2b3bb07

Browse files
committed
0.3.5
1 parent c89ee15 commit 2b3bb07

26 files changed

Lines changed: 565 additions & 286 deletions

ChangeLog

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
Version 0.3.3 (2025-10-04)
2-
* (Performance) Implement Ukkonen style banding to limit DP calculations to near diagonals
1+
Version 0.3.5 (2025-10-19)
2+
* (Performance) Specialized fast `single_gap_search` algorithm, see `RadixTree$single_gap_search`
33

4-
Version 0.3.2 (2025-10-03)
5-
* (Performance) Track DP column min to avoid redundant calculation
4+
Version 0.3.3 (2025-10-04)
5+
* (Performance) Implement Ukkonen style banding to limit DP calculations to near diagonals, improves performance 30-40%
66

77
Version 0.3.1 (2025-10-03)
88
* (Performance) Reworked trie alignment searches to reuse dynamic-programming workspaces
@@ -24,3 +24,4 @@ Version 0.2.7 (2024-01-25)
2424

2525
Version 0.2.6 (2024-01-25)
2626
* Minor fix for CRAN checks, std::basic_string<uint8_t> deprecated in Clang 18
27+

DESCRIPTION

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: seqtrie
22
Title: Radix Tree and Trie-Based String Distances
3-
Version: 0.3.4
4-
Date: 2025-10-04
3+
Version: 0.3.5
4+
Date: 2025-10-19
55
Authors@R: c(
66
person("Travers", "Ching", email = "traversc@gmail.com", role = c("aut", "cre", "cph")),
77
person("Martin", "Moene", role = c("ctb", "cph"), comment = "span-lite C++ library"),
@@ -20,7 +20,7 @@ LinkingTo:
2020
Imports:
2121
Rcpp (>= 0.12.18.3), RcppParallel (>= 5.1.3), R6, rlang, dplyr, stringi
2222
Suggests:
23-
knitr, rmarkdown, stringdist, qs, pwalign, igraph, ggplot2
23+
knitr, rmarkdown, stringdist, pwalign, igraph, ggplot2
2424
VignetteBuilder: knitr
2525
RoxygenNote: 7.3.2
2626
Roxygen: list(markdown = TRUE)

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ test:
7474
IS_LOCAL=Yes Rscript tests/test_pairwise.R && unset IS_LOCAL
7575
IS_LOCAL=Yes Rscript tests/test_RadixTree.R && unset IS_LOCAL
7676
IS_LOCAL=Yes Rscript tests/test_RadixForest.R && unset IS_LOCAL
77+
IS_LOCAL=Yes Rscript tests/test_single_gap_search.R && unset IS_LOCAL
7778

7879
test-trie:
7980
IS_LOCAL=Yes Rscript tests/test_RadixTree.R && unset IS_LOCAL

R/RadixForest.R

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,9 +141,7 @@ RadixForest <- R6::R6Class("RadixForest", list(
141141
}
142142

143143
if(!is.null(max_distance)) {
144-
if(length(max_distance) == 1) {
145-
max_distance <- rep(max_distance, length(query))
146-
}
144+
max_distance <- recycle_arg(max_distance, query)
147145
} else if(!is.null(max_fraction)) {
148146
max_distance <- as.integer(nchar(query) * max_fraction)
149147
} else {

R/RadixTree.R

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -141,15 +141,13 @@ RadixTree <- R6::R6Class("RadixTree", public = list(
141141
#' @param show_progress `r rdoc("show_progress")`
142142
#' @return The output is a data.frame of all matches with columns "query" and "target".
143143
#' For anchored searches, the output also includes attributes "query_size" and "target_size" which are vectors containing the portion of the query and target sequences that are aligned.
144-
search = function(query, max_distance = NULL, max_fraction = NULL, mode = "levenshtein", cost_matrix = NULL, gap_cost = NULL, gap_open_cost = NULL, nthreads = 1, show_progress = FALSE) {
144+
search = function(query, max_distance = NULL, max_fraction = NULL, mode = "levenshtein", cost_matrix = NULL, gap_cost = NA_integer_, gap_open_cost = NA_integer_, nthreads = 1, show_progress = FALSE) {
145145
charset <- unique(c(CharCounter_keys(self$char_counter_pointer), get_charset(query)))
146146
check_alignment_params(mode, cost_matrix, gap_cost, gap_open_cost, charset, diag_must_be_zero = TRUE)
147147
mode <- normalize_mode_parameter(mode)
148148

149149
if (!is.null(max_distance)) {
150-
if (length(max_distance) == 1) {
151-
max_distance <- rep(max_distance, length(query))
152-
}
150+
max_distance <- recycle_arg(max_distance, query)
153151
} else if (!is.null(max_fraction)) {
154152
max_distance <- as.integer(nchar(query) * max_fraction)
155153
} else {
@@ -159,14 +157,12 @@ RadixTree <- R6::R6Class("RadixTree", public = list(
159157
stop("max_distance/max_fraction must be non-negative")
160158
}
161159

162-
# defaults for C++ plain ints
163-
if (is.null(gap_cost)) gap_cost <- 1L
164-
if (is.null(gap_open_cost)) gap_open_cost <- 0L
165-
# Align conventions with pwalign/Biostrings: first gap includes one extension
166-
if (gap_open_cost > 0L) gap_open_cost <- gap_open_cost + gap_cost
167-
result <- RadixTree_search(self$root_pointer, query, max_distance, mode, cost_matrix, as.integer(gap_cost), as.integer(gap_open_cost), nthreads, show_progress)
160+
if (!is.na(gap_open_cost) && !is.na(gap_cost) && gap_open_cost > 0L) {
161+
gap_open_cost <- gap_open_cost + gap_cost
162+
}
163+
result <- RadixTree_search(self$root_pointer, query, max_distance, mode, cost_matrix, gap_cost, gap_open_cost, nthreads, show_progress)
168164
if (mode == "anchored") { # Append query_size and target_size attributes
169-
result2 <- c_dist_pairwise(result$query, result$target, mode, cost_matrix, as.integer(gap_cost), as.integer(gap_open_cost), nthreads, show_progress = FALSE)
165+
result2 <- c_dist_pairwise(result$query, result$target, mode, cost_matrix, gap_cost, gap_open_cost, nthreads, show_progress = FALSE)
170166
if (any(result$distance != result2)) {
171167
stop("Internal error: anchored search results do not match pairwise results")
172168
}
@@ -175,6 +171,29 @@ RadixTree <- R6::R6Class("RadixTree", public = list(
175171
}
176172
result
177173
},
174+
#' @description A specialized algorithm for searching for sequences allowing at most a single gap within the alignment itself. The mode is always "anchored" and does not penalize end gaps.
175+
#' @param query `r rdoc("query")`
176+
#' @param max_distance `r rdoc("max_distance")`
177+
#' @param gap_cost `r rdoc("gap_cost")`
178+
#' @param nthreads `r rdoc("nthreads")`
179+
#' @param show_progress `r rdoc("show_progress")`
180+
#' @return The output is a data.frame of matches with columns "query", "target" and "distance".
181+
single_gap_search = function(query,
182+
max_distance,
183+
gap_cost = 1L,
184+
nthreads = 1,
185+
show_progress = FALSE) {
186+
if (!is.null(max_distance)) {
187+
max_distance <- recycle_arg(max_distance, query)
188+
} else {
189+
stop("Either max_distance must be non-null")
190+
}
191+
if (any(max_distance < 0)) {
192+
stop("max_distance/max_fraction must be non-negative")
193+
}
194+
195+
RadixTree_single_gap_search(self$root_pointer, query, max_distance, gap_cost, nthreads, show_progress)
196+
},
178197
#' @description Validate the tree
179198
#' @return A logical indicating whether the tree is valid (TRUE) or not (FALSE). This is mostly an internal function for debugging purposes and should always return TRUE.
180199
validate = function() {

R/RadixTree_search_helpers.R

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
#' dist_search(c("ACGT", "AAAA"), c("ACG", "ACGT"), max_distance = 1, mode = "levenshtein")
2424
#' @name dist_search
2525
dist_search <- function(query, target, max_distance = NULL, max_fraction = NULL, mode = "levenshtein",
26-
cost_matrix = NULL, gap_cost = NULL, gap_open_cost = NULL, tree_class = "RadixTree",
26+
cost_matrix = NULL, gap_cost = NA_integer_, gap_open_cost = NA_integer_, tree_class = "RadixTree",
2727
nthreads = 1, show_progress = FALSE) {
2828
if (!tree_class %in% c("RadixTree", "RadixForest")) {
2929
stop("tree_class must be one of RadixTree or RadixForest")
@@ -33,7 +33,9 @@ dist_search <- function(query, target, max_distance = NULL, max_fraction = NULL,
3333
obj$insert(target)
3434
obj$search(query, max_distance, max_fraction, mode, cost_matrix, gap_cost, gap_open_cost, nthreads, show_progress)
3535
} else if(tree_class == "RadixForest") {
36-
if(!is.null(cost_matrix) || !is.null(gap_cost) || !is.null(gap_open_cost)) {
36+
gap_cost_provided <- !(length(gap_cost) == 1L && is.na(gap_cost))
37+
gap_open_provided <- !(length(gap_open_cost) == 1L && is.na(gap_open_cost))
38+
if(!is.null(cost_matrix) || gap_cost_provided || gap_open_provided) {
3739
stop("cost_matrix, gap_cost and gap_open_cost are not supported for RadixForest")
3840
}
3941
obj <- RadixForest$new()
@@ -117,4 +119,4 @@ split_search <- function(query, target, query_split, target_split, edge_trim = 0
117119
results <- dplyr::mutate(results, distance = .data$distance.left + .data$distance.right)
118120
results <- dplyr::filter(results, .data$distance <= max_distance)
119121
as.data.frame(results[c("query", "target", "distance")])
120-
}
122+
}

R/RcppExports.R

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,15 +105,19 @@ RadixTree_create <- function() {
105105
.Call(`_seqtrie_RadixTree_create`)
106106
}
107107

108-
RadixTree_search <- function(xp, query, max_distance, mode = "global", cost_matrix = NULL, gap_cost = 1L, gap_open_cost = 0L, nthreads = 1L, show_progress = FALSE) {
108+
RadixTree_search <- function(xp, query, max_distance, mode = "global", cost_matrix = NULL, gap_cost = NA_integer_, gap_open_cost = NA_integer_, nthreads = 1L, show_progress = FALSE) {
109109
.Call(`_seqtrie_RadixTree_search`, xp, query, max_distance, mode, cost_matrix, gap_cost, gap_open_cost, nthreads, show_progress)
110110
}
111111

112-
c_dist_matrix <- function(query, target, mode = "global", cost_matrix = NULL, gap_cost = 1L, gap_open_cost = 0L, nthreads = 1L, show_progress = FALSE) {
112+
RadixTree_single_gap_search <- function(xp, query, max_distance, gap_cost = 1L, nthreads = 1L, show_progress = FALSE) {
113+
.Call(`_seqtrie_RadixTree_single_gap_search`, xp, query, max_distance, gap_cost, nthreads, show_progress)
114+
}
115+
116+
c_dist_matrix <- function(query, target, mode = "global", cost_matrix = NULL, gap_cost = NA_integer_, gap_open_cost = NA_integer_, nthreads = 1L, show_progress = FALSE) {
113117
.Call(`_seqtrie_c_dist_matrix`, query, target, mode, cost_matrix, gap_cost, gap_open_cost, nthreads, show_progress)
114118
}
115119

116-
c_dist_pairwise <- function(query, target, mode = "levenshtein", cost_matrix = NULL, gap_cost = 1L, gap_open_cost = 0L, nthreads = 1L, show_progress = FALSE) {
120+
c_dist_pairwise <- function(query, target, mode = "levenshtein", cost_matrix = NULL, gap_cost = NA_integer_, gap_open_cost = NA_integer_, nthreads = 1L, show_progress = FALSE) {
117121
.Call(`_seqtrie_c_dist_pairwise`, query, target, mode, cost_matrix, gap_cost, gap_open_cost, nthreads, show_progress)
118122
}
119123

R/pairwise.R

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,14 @@
1818
#' @examples
1919
#' dist_matrix(c("ACGT", "AAAA"), c("ACG", "ACGT"), mode = "global")
2020
#' @name dist_matrix
21-
dist_matrix <- function(query, target, mode, cost_matrix = NULL, gap_cost = NULL, gap_open_cost = NULL, nthreads = 1, show_progress = FALSE) {
21+
dist_matrix <- function(query, target, mode, cost_matrix = NULL, gap_cost = NA_integer_, gap_open_cost = NA_integer_, nthreads = 1, show_progress = FALSE) {
2222
charset <- unique(c(get_charset(query), get_charset(target)))
2323
check_alignment_params(mode, cost_matrix, gap_cost, gap_open_cost, charset, diag_must_be_zero = FALSE)
2424
mode <- normalize_mode_parameter(mode)
25-
# defaults for C++ plain ints
26-
if (is.null(gap_cost)) gap_cost <- 1L
27-
if (is.null(gap_open_cost)) gap_open_cost <- 0L
28-
# Align conventions with pwalign/Biostrings: first gap includes one extension
29-
if (gap_open_cost > 0L) gap_open_cost <- gap_open_cost + gap_cost
30-
c_dist_matrix(query, target, mode, cost_matrix, as.integer(gap_cost), as.integer(gap_open_cost), nthreads, show_progress)
25+
if (!is.na(gap_open_cost) && !is.na(gap_cost) && gap_open_cost > 0L) {
26+
gap_open_cost <- gap_open_cost + gap_cost
27+
}
28+
c_dist_matrix(query, target, mode, cost_matrix, gap_cost, gap_open_cost, nthreads, show_progress)
3129
}
3230

3331
#' @title Pairwise distance between two sets of sequences
@@ -50,17 +48,15 @@ dist_matrix <- function(query, target, mode, cost_matrix = NULL, gap_cost = NULL
5048
#' @examples
5149
#' dist_pairwise(c("ACGT", "AAAA"), c("ACG", "ACGT"), mode = "global")
5250
#' @name dist_pairwise
53-
dist_pairwise <- function(query, target, mode, cost_matrix = NULL, gap_cost = NULL, gap_open_cost = NULL, nthreads = 1, show_progress = FALSE) {
51+
dist_pairwise <- function(query, target, mode, cost_matrix = NULL, gap_cost = NA_integer_, gap_open_cost = NA_integer_, nthreads = 1, show_progress = FALSE) {
5452
if (length(query) != length(target)) {
5553
stop("query and target must be the same length")
5654
}
5755
charset <- unique(c(get_charset(query), get_charset(target)))
5856
check_alignment_params(mode, cost_matrix, gap_cost, gap_open_cost, charset, diag_must_be_zero = FALSE)
5957
mode <- normalize_mode_parameter(mode)
60-
# defaults for C++ plain ints
61-
if (is.null(gap_cost)) gap_cost <- 1L
62-
if (is.null(gap_open_cost)) gap_open_cost <- 0L
63-
# Align conventions with pwalign/Biostrings: first gap includes one extension
64-
if (gap_open_cost > 0L) gap_open_cost <- gap_open_cost + gap_cost
65-
c_dist_pairwise(query, target, mode, cost_matrix, as.integer(gap_cost), as.integer(gap_open_cost), nthreads, show_progress)
58+
if (!is.na(gap_open_cost) && !is.na(gap_cost) && gap_open_cost > 0L) {
59+
gap_open_cost <- gap_open_cost + gap_cost
60+
}
61+
c_dist_pairwise(query, target, mode, cost_matrix, gap_cost, gap_open_cost, nthreads, show_progress)
6662
}

R/utils.R

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,14 @@ is_integerlike <- function(x) {
5858
}
5959
}
6060

61+
is_missing_arg <- function(x) {
62+
is.null(x) || (length(x) == 1L && is.na(x))
63+
}
64+
65+
recycle_arg <- function(recycle_vector, target_vector) {
66+
rep(recycle_vector, length.out = length(target_vector))
67+
}
68+
6169
# A cost matrix must
6270
# 1) Be a square matrix _OR_ NULL
6371
# 2) If gap_cost is NULL, cost_matrix must have a "gap" entry instead
@@ -103,16 +111,16 @@ check_cost_matrix <- function(cost_matrix, gap_cost, gap_open_cost, charset, dia
103111

104112
# gap parameters are provided separately in the R APIs now
105113
# validate gap_cost / gap_open_cost here, independent of matrix contents
106-
if (!is.null(gap_cost)) {
114+
if (!is_missing_arg(gap_cost)) {
107115
if (!(is_integerlike(gap_cost) && length(gap_cost) == 1 && gap_cost > 0)) {
108116
stop("gap_cost must be a single positive integer when provided")
109117
}
110118
}
111-
if (!is.null(gap_open_cost)) {
119+
if (!is_missing_arg(gap_open_cost)) {
112120
if (!(is_integerlike(gap_open_cost) && length(gap_open_cost) == 1 && gap_open_cost >= 0)) {
113121
stop("gap_open_cost must be a single non-negative integer when provided")
114122
}
115-
if (is.null(gap_cost)) {
123+
if (is_missing_arg(gap_cost)) {
116124
stop("If gap_open_cost is defined, gap_cost must also be defined")
117125
}
118126
}
@@ -133,7 +141,7 @@ check_alignment_params <- function(mode, cost_matrix, gap_cost, gap_open_cost, c
133141
stop("mode must be one of hamming (hm), global (gb, lv, levenshtein) or anchored (an, en, extension)")
134142
}
135143
check_cost_matrix(cost_matrix, gap_cost, gap_open_cost, charset, diag_must_be_zero)
136-
if ((mode %in% c("hamming", "hm")) && (!is.null(cost_matrix) || !is.null(gap_cost) || !is.null(gap_open_cost))) {
144+
if ((mode %in% c("hamming", "hm")) && (!is.null(cost_matrix) || !is_missing_arg(gap_cost) || !is_missing_arg(gap_open_cost))) {
137145
warning("cost_matrix and gap parameters are ignored when mode is 'hamming'")
138146
}
139147
}

README.md

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -336,17 +336,6 @@ tree$prefix_search("car")
336336
## 3 car carburetor
337337
## 4 car carbuncle
338338

339-
### Why not just use Bowtie2, BWA or other fast alignment software?
340-
341-
There are no apples-to-apples comparisons. With NGS alignment software,
342-
you are looking for alignments of reads (queries) *within* a genome
343-
reference (target). Here, we’re looking for alignments from the query to
344-
the *full* target. However, many NGS aligners do use Tries and similar
345-
data structures.
346-
347-
Compared to pairwise alignment packages, calculating all alignment pairs
348-
takes much longer, but on the other hand gives you more information.
349-
350339
### References and literature
351340

352341
- “Fast string correction with Levenshtein automata” (2002)

0 commit comments

Comments
 (0)