Skip to content

Commit aed76da

Browse files
committed
0.3.1
1 parent 49b42a7 commit aed76da

7 files changed

Lines changed: 201 additions & 110 deletions

File tree

.Rbuildignore

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,4 +12,5 @@ aclocal.m4
1212
Makefile
1313
^.vscode
1414
^.github
15-
^inst/extra_tests
15+
^inst/extra_tests
16+
Rplots.pdf

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,3 +24,4 @@ autom4te.cache/*
2424
.vscode
2525
/src/.vscode
2626
.lintr
27+
Rplots.pdf

ChangeLog

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,6 @@
1+
Version 0.3.1 (2025-10-03)
2+
* (Performance) Reworked trie alignment searches to reuse dynamic-programming workspaces
3+
14
Version 0.3.0 (2025-09-05)
25
* Code cleanup, documentation cleanup, cleaner interface
36
* Minor bug fixes

DESCRIPTION

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
Package: seqtrie
22
Title: Radix Tree and Trie-Based String Distances
3-
Version: 0.3.0
4-
Date: 2025-09-05
3+
Version: 0.3.1
4+
Date: 2025-10-03
55
Authors@R: c(
66
person("Travers", "Ching", email = "traversc@gmail.com", role = c("aut", "cre", "cph")),
77
person("Martin", "Moene", role = c("ctb", "cph"), comment = "span-lite C++ library"),

Makefile

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -75,9 +75,11 @@ test:
7575
IS_LOCAL=Yes Rscript tests/test_RadixTree.R && unset IS_LOCAL
7676
IS_LOCAL=Yes Rscript tests/test_RadixForest.R && unset IS_LOCAL
7777

78-
local-bench:
78+
bench:
7979
Rscript inst/extra_tests/benchmark.R
8080

81+
simple-bench:
82+
Rscript inst/extra_tests/simple_benchmark.R
8183

8284
R_INCLUDE=$(shell R CMD config --cppflags)
8385
Rcpp_INCLUDE=$(shell Rscript -e 'cat(system.file("include", package = "Rcpp"))')

inst/extra_tests/benchmark.R

Lines changed: 50 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -2,63 +2,63 @@ library(seqtrie)
22
library(stringdist)
33
library(Rcpp)
44
library(dplyr)
5-
library(qs)
5+
# library(qs)
66
library(ggplot2)
77

88
NITER <- 3
99

1010
tic <- function() { .time <<- Sys.time() }
1111
toc <- function() { as.numeric(Sys.time() - .time, units = "secs") }
1212

13-
encode_source <- function(file, width = 160) {
14-
n <- file.info(file)$size
15-
x <- readChar(con = file, nchars=n, useBytes = TRUE)
16-
x <- qserialize(x, preset = "custom", algorithm = "zstd", compress_level = 22)
17-
x <- base91_encode(x)
18-
starts <- seq(1,nchar(x), by=width)
19-
x <- sapply(starts, function(i) {
20-
substr(x, i, i+width-1)
21-
})
22-
x <- gsub('\\"', "\'", x)
23-
dput(x)
24-
}
13+
# encode_source <- function(file, width = 160) {
14+
# n <- file.info(file)$size
15+
# x <- readChar(con = file, nchars=n, useBytes = TRUE)
16+
# x <- qserialize(x, preset = "custom", algorithm = "zstd", compress_level = 22)
17+
# x <- base91_encode(x)
18+
# starts <- seq(1,nchar(x), by=width)
19+
# x <- sapply(starts, function(i) {
20+
# substr(x, i, i+width-1)
21+
# })
22+
# x <- gsub('\\"', "\'", x)
23+
# dput(x)
24+
# }
2525

26-
decode_source <- function(x) {
27-
x <- paste0(x, collapse = "")
28-
x <- gsub("\\'", '\\"', x)
29-
x <- base91_decode(x)
30-
qdeserialize(x)
31-
}
26+
# decode_source <- function(x) {
27+
# x <- paste0(x, collapse = "")
28+
# x <- gsub("\\'", '\\"', x)
29+
# x <- base91_decode(x)
30+
# qdeserialize(x)
31+
# }
3232

33-
# Compare with original implementation -- requires C++17
34-
ogsource <- c(
35-
"un]'BAAA@QRtHACAAAAAAAjn6BAABdk1kux.im:AwoDf|vw(wxy(B'B'1u@cml1Qlc1,lq.{^M^W015+H<yKma_%c3=C_*MC/<fXA'k%WLq#$$FAY'/ic,@n,wG&e5K?kmD.ax^T_1bN;Dvt&$kw=+xwINU+[>+[",
36-
"n&'q/(a,Z}8|b6`Vc_HnG8}n2@8gO;qKg[+LC#sWWFj~ojJ/`$5:)9v>Q^yOUnnpT(HZ4SG=weaYl2:4wzxel*j%G?*zR`[ZNZuv7RoHDx^d`tdM{l7mg82*xG0#u<xf~heiFHYMtZ<E,(6ja/#l<$Qg|iOaZ847",
37-
"8aDsFeTP|B|Lx(X@8QJP:.[^p5l<8&(rvveS8M{Jt49D*whd6RT!8HxyOwh15f{VErZaFyQ5Vf>n_PP*W*SW`G,Sd(4_Yr(nUF8w8^Qc{4H3u!+ia|df8=7xWIYku4xI3oAa*83Tu7/1L%9@`xQ#hir#^M,Hs{<_",
38-
"Me!L4*dPtzP/mD^+Eo([?Gd0'5T6o}xP$=nf(rz_H7PZjOBW|w0*!Y$6D:[L_Vx9%bsCZz<NwVxS]gIl+Ob{1:mhyJORTNip0&!C)Dl}~<?pkeLgI%]XK+sTh%n#,l>:.Ec>W35]yU/#lg1+nnFP8H7(O=.JBz!@",
39-
";eSC&}%r!r]Mcg4KM:o+E,2p*otfWnbKiy.[mLUp%e?e2N9mWbi1(@&lNU1qIJu=3pSQ1b+^+ir|&A||8+;<$KL`LOvG=(E9hlR^78AWq+VkJrzI(.dQ=jJcAb;`)3'.,REsKCwU|QMRRM8gwgDiS#ALR/W/Qnb5",
40-
"adW[k}?61TM?N4Bm8rb'|v/u<l32(R?*55'nTCn:/88a@r@X%1Z'u4aFvtbYkQ$D>J[c$+N]{6GSbS0U8MO_?byTm;/{weP8Fa.Z(d=*8ax+3@O?dJp/~xpIU(f&$ECL$&lLSwaq1t.EmrHM:aTX[DICu*^D:Ts+",
41-
"GEsLmu(:iG1Ez/=Nr<Z|QShD`Jai4g%n~pmB5)Z@9Qhhbc@^gIfHSZX|Du*Y$PZKEdzA/FAAKO95AApZTXn2RdpD4xXEI7<c=5+R^XCtCABtgAGANcAAY4)IT@,;;'>#OOv*1KBB[ng(U_:Y3YDVh]M{B9jpmc/Z",
42-
"<qPOb)%.n[Zxj/Q8(dx9DS|}hReg8zA&4:Czu[n$YyGxEGU91iG:g[tcgs.EFn!H?t{<W&u0g9Qz{HUSe2Wak60YJEhj46?oehN0A,xIM*K1*m.DO`H?,u2q+g3}Df?ga`_ajCiDal5ICNF14GJHsv{a.y2ZU@{L",
43-
";*KV;V'q:@=v?'$e3MbI%kd]UX*vQkcqCF'T&i`I7Mhtfx9gXE5=^Ynej~Qas_X_(&3bw8AovP&U<bXo),WQ&(Ts*'7rOJg!fVxD)BKS,f){uQ[*nF_|>rXyz<zwM;/!{0WTtUzh`3gby3!5+~xjxT?~@RvZcRXp",
44-
"Ua2NClD/6_E?=^[DzN3O%zq9n@3&Xnt_k+2D7`.=qHgErF~NGiz~:)gQhJJz|3@6W=PZ:#mfCEp>7;xykS~IVM'(vlyKNp_<HbjkmMv:/|z]O'lmf*'GK+.L`g*yA*h}Aveavu|rjf3&(.M+Q1GHnlqx|R*l!ZV,",
45-
"^d0]Vn,DxMV~/~LiA`.YChJCZ;<m?|sU3(/|n;rZImy:(*}+Y#=3gOwL4('BOE^B~V@6m|$y[N]^ykqYqw,@mJp@M__QRa{XFb%kE3=ho.bseCqQs1LaqY(oWZ{qG}_jVYB{dcy7>JIdQ,83[~@T=t.xh>s^)I3S",
46-
"dki(|C|x|E3ENOpuMCkQQ`0%1uSYMBP9+|Du+_z)@P)~E05zR~atrERVGz7>G*tc4.1#{5ODYvw_l6vXt6!8xec+]qfhhKH6pZqkg<`y%?cYwG)si>XY`REyahjn*N:*l_?jkIKfO%T2ay4T>k=b,sJKS7ae~9ch",
47-
"0|>m/o=Mxc,`6AEz*|!|6N|DR}x87Y~9ag[3'>XUt;4$Df?!O|SWPR.`mut_&Ktb*F*5g%G!Y~>10E&gS1S}uh>=66aN#@z4#=Pl|1>VOw^+H)J!kC{N_B.^I8uZl)vS)}3ESdrM^dmPi:!t1KF>GKtIWs]'k:{1",
48-
"oYEW$gwzt1#)IW2DqE2L+T{K@acd!%[0G.G!VF.pouqN0&J@o%4G0EJa}(sW5V3%>Tq@GuL9R!)uOvS2HWvaf'K`_G):7}(wh=]k%k%d,V#}e)pCU)kSQZ.>u}/we4f@&FB'C;z[`S?%'C%*|HTB/9L4`5,{ccQn",
49-
"sr9eH'cjnDxC>z@hrT]BblzvVdXn?l!T)i%__BKiBXSD%N;Uh[5D,vjj|I</ItSiH`jNI$6e{.8b1<)EVYvUT6YZ<zoCus{2~O|/X!=;rUyf32jjgn[[A{;,Q;b/C`8dYTcq+Ue^/ze0<Oj6a?IVwccBz6Qax#/a",
50-
"u>tCP8c!.qEBcC(ES.yRWNH>W`*l^z4a:flaa6;GzM@|R!soPT[B<0f?Wq:$(xDwCPz!>:Qic5{ca665am:(`[L<1v,qo+E~'(')Cvs;=ze_WkLDbL&PFns;eb]q375:QR8y.NAVE+Nb]OKnuQ1X9HDq.c#~U8_`",
51-
"L&M:NFQTXR*)1b~X,rmfxj?~C=uewOk@fVNUS=@W$L>`fy:.q5QP|5s'B}upI$,jmh[h};CXs.XWzTn*9q>apPu*@]'vq01E+,'6]yCaK#AT^J58Kn7[`O'}OP;K@.wTB7#(90H2P:OH>}*o}b'B<ur7:FF0q43X",
52-
"6%pS^e+fg?frqYI+T8kj8^7[@)DV}.',Gh<;oioh<.5s&F%uIP?f=EN`:4&GIB]c6`%<4Dei%S+MI@k,vK|xGhE27::kKM?HU+'gK7Jy)_%~xV<`Q!,Y$xBZxnO$E<#Lsvz#tcDSjit:9yhBC&6idCOfjQ~fREp4",
53-
".T4?o9^$&h9l$A)~jxwTrFYE*YV?]F0?9&ZSeSF=uJd1wswalB~2=_E[^HlDkZAcIjwx17oci_Ud1O>4~)yavI*`Vv'HD")
54-
sourceCpp(code = decode_source(ogsource))
55-
56-
run_og <- function(query, target, max_distance, show_progress = F) {
57-
results <- og_levenshteinSearch(query, target, max_distance = max_distance)
58-
results$query <- query[results$query+1]
59-
results$target <- target[results$target+1]
60-
results %>% arrange(query, target)
61-
}
33+
# # Compare with original implementation -- requires C++17
34+
# ogsource <- c(
35+
# "un]'BAAA@QRtHACAAAAAAAjn6BAABdk1kux.im:AwoDf|vw(wxy(B'B'1u@cml1Qlc1,lq.{^M^W015+H<yKma_%c3=C_*MC/<fXA'k%WLq#$$FAY'/ic,@n,wG&e5K?kmD.ax^T_1bN;Dvt&$kw=+xwINU+[>+[",
36+
# "n&'q/(a,Z}8|b6`Vc_HnG8}n2@8gO;qKg[+LC#sWWFj~ojJ/`$5:)9v>Q^yOUnnpT(HZ4SG=weaYl2:4wzxel*j%G?*zR`[ZNZuv7RoHDx^d`tdM{l7mg82*xG0#u<xf~heiFHYMtZ<E,(6ja/#l<$Qg|iOaZ847",
37+
# "8aDsFeTP|B|Lx(X@8QJP:.[^p5l<8&(rvveS8M{Jt49D*whd6RT!8HxyOwh15f{VErZaFyQ5Vf>n_PP*W*SW`G,Sd(4_Yr(nUF8w8^Qc{4H3u!+ia|df8=7xWIYku4xI3oAa*83Tu7/1L%9@`xQ#hir#^M,Hs{<_",
38+
# "Me!L4*dPtzP/mD^+Eo([?Gd0'5T6o}xP$=nf(rz_H7PZjOBW|w0*!Y$6D:[L_Vx9%bsCZz<NwVxS]gIl+Ob{1:mhyJORTNip0&!C)Dl}~<?pkeLgI%]XK+sTh%n#,l>:.Ec>W35]yU/#lg1+nnFP8H7(O=.JBz!@",
39+
# ";eSC&}%r!r]Mcg4KM:o+E,2p*otfWnbKiy.[mLUp%e?e2N9mWbi1(@&lNU1qIJu=3pSQ1b+^+ir|&A||8+;<$KL`LOvG=(E9hlR^78AWq+VkJrzI(.dQ=jJcAb;`)3'.,REsKCwU|QMRRM8gwgDiS#ALR/W/Qnb5",
40+
# "adW[k}?61TM?N4Bm8rb'|v/u<l32(R?*55'nTCn:/88a@r@X%1Z'u4aFvtbYkQ$D>J[c$+N]{6GSbS0U8MO_?byTm;/{weP8Fa.Z(d=*8ax+3@O?dJp/~xpIU(f&$ECL$&lLSwaq1t.EmrHM:aTX[DICu*^D:Ts+",
41+
# "GEsLmu(:iG1Ez/=Nr<Z|QShD`Jai4g%n~pmB5)Z@9Qhhbc@^gIfHSZX|Du*Y$PZKEdzA/FAAKO95AApZTXn2RdpD4xXEI7<c=5+R^XCtCABtgAGANcAAY4)IT@,;;'>#OOv*1KBB[ng(U_:Y3YDVh]M{B9jpmc/Z",
42+
# "<qPOb)%.n[Zxj/Q8(dx9DS|}hReg8zA&4:Czu[n$YyGxEGU91iG:g[tcgs.EFn!H?t{<W&u0g9Qz{HUSe2Wak60YJEhj46?oehN0A,xIM*K1*m.DO`H?,u2q+g3}Df?ga`_ajCiDal5ICNF14GJHsv{a.y2ZU@{L",
43+
# ";*KV;V'q:@=v?'$e3MbI%kd]UX*vQkcqCF'T&i`I7Mhtfx9gXE5=^Ynej~Qas_X_(&3bw8AovP&U<bXo),WQ&(Ts*'7rOJg!fVxD)BKS,f){uQ[*nF_|>rXyz<zwM;/!{0WTtUzh`3gby3!5+~xjxT?~@RvZcRXp",
44+
# "Ua2NClD/6_E?=^[DzN3O%zq9n@3&Xnt_k+2D7`.=qHgErF~NGiz~:)gQhJJz|3@6W=PZ:#mfCEp>7;xykS~IVM'(vlyKNp_<HbjkmMv:/|z]O'lmf*'GK+.L`g*yA*h}Aveavu|rjf3&(.M+Q1GHnlqx|R*l!ZV,",
45+
# "^d0]Vn,DxMV~/~LiA`.YChJCZ;<m?|sU3(/|n;rZImy:(*}+Y#=3gOwL4('BOE^B~V@6m|$y[N]^ykqYqw,@mJp@M__QRa{XFb%kE3=ho.bseCqQs1LaqY(oWZ{qG}_jVYB{dcy7>JIdQ,83[~@T=t.xh>s^)I3S",
46+
# "dki(|C|x|E3ENOpuMCkQQ`0%1uSYMBP9+|Du+_z)@P)~E05zR~atrERVGz7>G*tc4.1#{5ODYvw_l6vXt6!8xec+]qfhhKH6pZqkg<`y%?cYwG)si>XY`REyahjn*N:*l_?jkIKfO%T2ay4T>k=b,sJKS7ae~9ch",
47+
# "0|>m/o=Mxc,`6AEz*|!|6N|DR}x87Y~9ag[3'>XUt;4$Df?!O|SWPR.`mut_&Ktb*F*5g%G!Y~>10E&gS1S}uh>=66aN#@z4#=Pl|1>VOw^+H)J!kC{N_B.^I8uZl)vS)}3ESdrM^dmPi:!t1KF>GKtIWs]'k:{1",
48+
# "oYEW$gwzt1#)IW2DqE2L+T{K@acd!%[0G.G!VF.pouqN0&J@o%4G0EJa}(sW5V3%>Tq@GuL9R!)uOvS2HWvaf'K`_G):7}(wh=]k%k%d,V#}e)pCU)kSQZ.>u}/we4f@&FB'C;z[`S?%'C%*|HTB/9L4`5,{ccQn",
49+
# "sr9eH'cjnDxC>z@hrT]BblzvVdXn?l!T)i%__BKiBXSD%N;Uh[5D,vjj|I</ItSiH`jNI$6e{.8b1<)EVYvUT6YZ<zoCus{2~O|/X!=;rUyf32jjgn[[A{;,Q;b/C`8dYTcq+Ue^/ze0<Oj6a?IVwccBz6Qax#/a",
50+
# "u>tCP8c!.qEBcC(ES.yRWNH>W`*l^z4a:flaa6;GzM@|R!soPT[B<0f?Wq:$(xDwCPz!>:Qic5{ca665am:(`[L<1v,qo+E~'(')Cvs;=ze_WkLDbL&PFns;eb]q375:QR8y.NAVE+Nb]OKnuQ1X9HDq.c#~U8_`",
51+
# "L&M:NFQTXR*)1b~X,rmfxj?~C=uewOk@fVNUS=@W$L>`fy:.q5QP|5s'B}upI$,jmh[h};CXs.XWzTn*9q>apPu*@]'vq01E+,'6]yCaK#AT^J58Kn7[`O'}OP;K@.wTB7#(90H2P:OH>}*o}b'B<ur7:FF0q43X",
52+
# "6%pS^e+fg?frqYI+T8kj8^7[@)DV}.',Gh<;oioh<.5s&F%uIP?f=EN`:4&GIB]c6`%<4Dei%S+MI@k,vK|xGhE27::kKM?HU+'gK7Jy)_%~xV<`Q!,Y$xBZxnO$E<#Lsvz#tcDSjit:9yhBC&6idCOfjQ~fREp4",
53+
# ".T4?o9^$&h9l$A)~jxwTrFYE*YV?]F0?9&ZSeSF=uJd1wswalB~2=_E[^HlDkZAcIjwx17oci_Ud1O>4~)yavI*`Vv'HD")
54+
# sourceCpp(code = decode_source(ogsource))
55+
56+
# run_og <- function(query, target, max_distance, show_progress = F) {
57+
# results <- og_levenshteinSearch(query, target, max_distance = max_distance)
58+
# results$query <- query[results$query+1]
59+
# results$target <- target[results$target+1]
60+
# results %>% arrange(query, target)
61+
# }
6262

6363
# run_dnatree <- function(query, target, max_distance=NULL, max_fraction=NULL, mode = "levenshtein", show_progress = FALSE, nthreads = 8) {
6464
# x <- treedist::DNATree$new()
@@ -106,8 +106,8 @@ run_stringdist <- function(query, target, max_distance=NULL, max_fraction=NULL,
106106
# methods <- list(run_dnatree, run_radixtree, run_radixforest, run_prefixtree, run_stringdist, run_og)
107107
# names(methods) <- c("DNATree", "RadixTree", "RadixForest", "PrefixTree", "stringdist", "OG")
108108

109-
methods <- list(run_radixtree, run_radixforest, run_og)
110-
names(methods) <- c("RadixTree", "RadixForest", "OG")
109+
methods <- list(run_radixtree, run_radixforest)
110+
names(methods) <- c("RadixTree", "RadixForest")
111111

112112
# data("covid_cdr3")
113113
cc3_subset <- sample(covid_cdr3, size = 1000)

0 commit comments

Comments
 (0)