@@ -2,63 +2,63 @@ library(seqtrie)
22library(stringdist )
33library(Rcpp )
44library(dplyr )
5- library(qs )
5+ # library(qs)
66library(ggplot2 )
77
88NITER <- 3
99
1010tic <- function () { .time <<- Sys.time() }
1111toc <- function () { as.numeric(Sys.time() - .time , units = " secs" ) }
1212
13- encode_source <- function (file , width = 160 ) {
14- n <- file.info(file )$ size
15- x <- readChar(con = file , nchars = n , useBytes = TRUE )
16- x <- qserialize(x , preset = " custom" , algorithm = " zstd" , compress_level = 22 )
17- x <- base91_encode(x )
18- starts <- seq(1 ,nchar(x ), by = width )
19- x <- sapply(starts , function (i ) {
20- substr(x , i , i + width - 1 )
21- })
22- x <- gsub(' \\ "' , " \' " , x )
23- dput(x )
24- }
13+ # encode_source <- function(file, width = 160) {
14+ # n <- file.info(file)$size
15+ # x <- readChar(con = file, nchars=n, useBytes = TRUE)
16+ # x <- qserialize(x, preset = "custom", algorithm = "zstd", compress_level = 22)
17+ # x <- base91_encode(x)
18+ # starts <- seq(1,nchar(x), by=width)
19+ # x <- sapply(starts, function(i) {
20+ # substr(x, i, i+width-1)
21+ # })
22+ # x <- gsub('\\"', "\'", x)
23+ # dput(x)
24+ # }
2525
26- decode_source <- function (x ) {
27- x <- paste0(x , collapse = " " )
28- x <- gsub(" \\ '" , ' \\ "' , x )
29- x <- base91_decode(x )
30- qdeserialize(x )
31- }
26+ # decode_source <- function(x) {
27+ # x <- paste0(x, collapse = "")
28+ # x <- gsub("\\'", '\\"', x)
29+ # x <- base91_decode(x)
30+ # qdeserialize(x)
31+ # }
3232
33- # Compare with original implementation -- requires C++17
34- ogsource <- c(
35- " un]'BAAA@QRtHACAAAAAAAjn6BAABdk1kux.im:AwoDf|vw(wxy(B'B'1u@cml1Qlc1,lq.{^M^W015+H<yKma_%c3=C_*MC/<fXA'k%WLq#$$FAY'/ic,@n,wG&e5K?kmD.ax^T_1bN;Dvt&$kw=+xwINU+[>+[" ,
36- " n&'q/(a,Z}8|b6`Vc_HnG8}n2@8gO;qKg[+LC#sWWFj~ojJ/`$5:)9v>Q^yOUnnpT(HZ4SG=weaYl2:4wzxel*j%G?*zR`[ZNZuv7RoHDx^d`tdM{l7mg82*xG0#u<xf~heiFHYMtZ<E,(6ja/#l<$Qg|iOaZ847" ,
37- " 8aDsFeTP|B|Lx(X@8QJP:.[^p5l<8&(rvveS8M{Jt49D*whd6RT!8HxyOwh15f{VErZaFyQ5Vf>n_PP*W*SW`G,Sd(4_Yr(nUF8w8^Qc{4H3u!+ia|df8=7xWIYku4xI3oAa*83Tu7/1L%9@`xQ#hir#^M,Hs{<_" ,
38- " Me!L4*dPtzP/mD^+Eo([?Gd0'5T6o}xP$=nf(rz_H7PZjOBW|w0*!Y$6D:[L_Vx9%bsCZz<NwVxS]gIl+Ob{1:mhyJORTNip0&!C)Dl}~<?pkeLgI%]XK+sTh%n#,l>:.Ec>W35]yU/#lg1+nnFP8H7(O=.JBz!@" ,
39- " ;eSC&}%r!r]Mcg4KM:o+E,2p*otfWnbKiy.[mLUp%e?e2N9mWbi1(@&lNU1qIJu=3pSQ1b+^+ir|&A||8+;<$KL`LOvG=(E9hlR^78AWq+VkJrzI(.dQ=jJcAb;`)3'.,REsKCwU|QMRRM8gwgDiS#ALR/W/Qnb5" ,
40- " adW[k}?61TM?N4Bm8rb'|v/u<l32(R?*55'nTCn:/88a@r@X%1Z'u4aFvtbYkQ$D>J[c$+N]{6GSbS0U8MO_?byTm;/{weP8Fa.Z(d=*8ax+3@O?dJp/~xpIU(f&$ECL$&lLSwaq1t.EmrHM:aTX[DICu*^D:Ts+" ,
41- " GEsLmu(:iG1Ez/=Nr<Z|QShD`Jai4g%n~pmB5)Z@9Qhhbc@^gIfHSZX|Du*Y$PZKEdzA/FAAKO95AApZTXn2RdpD4xXEI7<c=5+R^XCtCABtgAGANcAAY4)IT@,;;'>#OOv*1KBB[ng(U_:Y3YDVh]M{B9jpmc/Z" ,
42- " <qPOb)%.n[Zxj/Q8(dx9DS|}hReg8zA&4:Czu[n$YyGxEGU91iG:g[tcgs.EFn!H?t{<W&u0g9Qz{HUSe2Wak60YJEhj46?oehN0A,xIM*K1*m.DO`H?,u2q+g3}Df?ga`_ajCiDal5ICNF14GJHsv{a.y2ZU@{L" ,
43- " ;*KV;V'q:@=v?'$e3MbI%kd]UX*vQkcqCF'T&i`I7Mhtfx9gXE5=^Ynej~Qas_X_(&3bw8AovP&U<bXo),WQ&(Ts*'7rOJg!fVxD)BKS,f){uQ[*nF_|>rXyz<zwM;/!{0WTtUzh`3gby3!5+~xjxT?~@RvZcRXp" ,
44- " Ua2NClD/6_E?=^[DzN3O%zq9n@3&Xnt_k+2D7`.=qHgErF~NGiz~:)gQhJJz|3@6W=PZ:#mfCEp>7;xykS~IVM'(vlyKNp_<HbjkmMv:/|z]O'lmf*'GK+.L`g*yA*h}Aveavu|rjf3&(.M+Q1GHnlqx|R*l!ZV," ,
45- " ^d0]Vn,DxMV~/~LiA`.YChJCZ;<m?|sU3(/|n;rZImy:(*}+Y#=3gOwL4('BOE^B~V@6m|$y[N]^ykqYqw,@mJp@M__QRa{XFb%kE3=ho.bseCqQs1LaqY(oWZ{qG}_jVYB{dcy7>JIdQ,83[~@T=t.xh>s^)I3S" ,
46- " dki(|C|x|E3ENOpuMCkQQ`0%1uSYMBP9+|Du+_z)@P)~E05zR~atrERVGz7>G*tc4.1#{5ODYvw_l6vXt6!8xec+]qfhhKH6pZqkg<`y%?cYwG)si>XY`REyahjn*N:*l_?jkIKfO%T2ay4T>k=b,sJKS7ae~9ch" ,
47- " 0|>m/o=Mxc,`6AEz*|!|6N|DR}x87Y~9ag[3'>XUt;4$Df?!O|SWPR.`mut_&Ktb*F*5g%G!Y~>10E&gS1S}uh>=66aN#@z4#=Pl|1>VOw^+H)J!kC{N_B.^I8uZl)vS)}3ESdrM^dmPi:!t1KF>GKtIWs]'k:{1" ,
48- " oYEW$gwzt1#)IW2DqE2L+T{K@acd!%[0G.G!VF.pouqN0&J@o%4G0EJa}(sW5V3%>Tq@GuL9R!)uOvS2HWvaf'K`_G):7}(wh=]k%k%d,V#}e)pCU)kSQZ.>u}/we4f@&FB'C;z[`S?%'C%*|HTB/9L4`5,{ccQn" ,
49- " sr9eH'cjnDxC>z@hrT]BblzvVdXn?l!T)i%__BKiBXSD%N;Uh[5D,vjj|I</ItSiH`jNI$6e{.8b1<)EVYvUT6YZ<zoCus{2~O|/X!=;rUyf32jjgn[[A{;,Q;b/C`8dYTcq+Ue^/ze0<Oj6a?IVwccBz6Qax#/a" ,
50- " u>tCP8c!.qEBcC(ES.yRWNH>W`*l^z4a:flaa6;GzM@|R!soPT[B<0f?Wq:$(xDwCPz!>:Qic5{ca665am:(`[L<1v,qo+E~'(')Cvs;=ze_WkLDbL&PFns;eb]q375:QR8y.NAVE+Nb]OKnuQ1X9HDq.c#~U8_`" ,
51- " L&M:NFQTXR*)1b~X,rmfxj?~C=uewOk@fVNUS=@W$L>`fy:.q5QP|5s'B}upI$,jmh[h};CXs.XWzTn*9q>apPu*@]'vq01E+,'6]yCaK#AT^J58Kn7[`O'}OP;K@.wTB7#(90H2P:OH>}*o}b'B<ur7:FF0q43X" ,
52- " 6%pS^e+fg?frqYI+T8kj8^7[@)DV}.',Gh<;oioh<.5s&F%uIP?f=EN`:4&GIB]c6`%<4Dei%S+MI@k,vK|xGhE27::kKM?HU+'gK7Jy)_%~xV<`Q!,Y$xBZxnO$E<#Lsvz#tcDSjit:9yhBC&6idCOfjQ~fREp4" ,
53- " .T4?o9^$&h9l$A)~jxwTrFYE*YV?]F0?9&ZSeSF=uJd1wswalB~2=_E[^HlDkZAcIjwx17oci_Ud1O>4~)yavI*`Vv'HD" )
54- sourceCpp(code = decode_source(ogsource ))
55-
56- run_og <- function (query , target , max_distance , show_progress = F ) {
57- results <- og_levenshteinSearch(query , target , max_distance = max_distance )
58- results $ query <- query [results $ query + 1 ]
59- results $ target <- target [results $ target + 1 ]
60- results %> % arrange(query , target )
61- }
33+ # # Compare with original implementation -- requires C++17
34+ # ogsource <- c(
35+ # "un]'BAAA@QRtHACAAAAAAAjn6BAABdk1kux.im:AwoDf|vw(wxy(B'B'1u@cml1Qlc1,lq.{^M^W015+H<yKma_%c3=C_*MC/<fXA'k%WLq#$$FAY'/ic,@n,wG&e5K?kmD.ax^T_1bN;Dvt&$kw=+xwINU+[>+[",
36+ # "n&'q/(a,Z}8|b6`Vc_HnG8}n2@8gO;qKg[+LC#sWWFj~ojJ/`$5:)9v>Q^yOUnnpT(HZ4SG=weaYl2:4wzxel*j%G?*zR`[ZNZuv7RoHDx^d`tdM{l7mg82*xG0#u<xf~heiFHYMtZ<E,(6ja/#l<$Qg|iOaZ847",
37+ # "8aDsFeTP|B|Lx(X@8QJP:.[^p5l<8&(rvveS8M{Jt49D*whd6RT!8HxyOwh15f{VErZaFyQ5Vf>n_PP*W*SW`G,Sd(4_Yr(nUF8w8^Qc{4H3u!+ia|df8=7xWIYku4xI3oAa*83Tu7/1L%9@`xQ#hir#^M,Hs{<_",
38+ # "Me!L4*dPtzP/mD^+Eo([?Gd0'5T6o}xP$=nf(rz_H7PZjOBW|w0*!Y$6D:[L_Vx9%bsCZz<NwVxS]gIl+Ob{1:mhyJORTNip0&!C)Dl}~<?pkeLgI%]XK+sTh%n#,l>:.Ec>W35]yU/#lg1+nnFP8H7(O=.JBz!@",
39+ # ";eSC&}%r!r]Mcg4KM:o+E,2p*otfWnbKiy.[mLUp%e?e2N9mWbi1(@&lNU1qIJu=3pSQ1b+^+ir|&A||8+;<$KL`LOvG=(E9hlR^78AWq+VkJrzI(.dQ=jJcAb;`)3'.,REsKCwU|QMRRM8gwgDiS#ALR/W/Qnb5",
40+ # "adW[k}?61TM?N4Bm8rb'|v/u<l32(R?*55'nTCn:/88a@r@X%1Z'u4aFvtbYkQ$D>J[c$+N]{6GSbS0U8MO_?byTm;/{weP8Fa.Z(d=*8ax+3@O?dJp/~xpIU(f&$ECL$&lLSwaq1t.EmrHM:aTX[DICu*^D:Ts+",
41+ # "GEsLmu(:iG1Ez/=Nr<Z|QShD`Jai4g%n~pmB5)Z@9Qhhbc@^gIfHSZX|Du*Y$PZKEdzA/FAAKO95AApZTXn2RdpD4xXEI7<c=5+R^XCtCABtgAGANcAAY4)IT@,;;'>#OOv*1KBB[ng(U_:Y3YDVh]M{B9jpmc/Z",
42+ # "<qPOb)%.n[Zxj/Q8(dx9DS|}hReg8zA&4:Czu[n$YyGxEGU91iG:g[tcgs.EFn!H?t{<W&u0g9Qz{HUSe2Wak60YJEhj46?oehN0A,xIM*K1*m.DO`H?,u2q+g3}Df?ga`_ajCiDal5ICNF14GJHsv{a.y2ZU@{L",
43+ # ";*KV;V'q:@=v?'$e3MbI%kd]UX*vQkcqCF'T&i`I7Mhtfx9gXE5=^Ynej~Qas_X_(&3bw8AovP&U<bXo),WQ&(Ts*'7rOJg!fVxD)BKS,f){uQ[*nF_|>rXyz<zwM;/!{0WTtUzh`3gby3!5+~xjxT?~@RvZcRXp",
44+ # "Ua2NClD/6_E?=^[DzN3O%zq9n@3&Xnt_k+2D7`.=qHgErF~NGiz~:)gQhJJz|3@6W=PZ:#mfCEp>7;xykS~IVM'(vlyKNp_<HbjkmMv:/|z]O'lmf*'GK+.L`g*yA*h}Aveavu|rjf3&(.M+Q1GHnlqx|R*l!ZV,",
45+ # "^d0]Vn,DxMV~/~LiA`.YChJCZ;<m?|sU3(/|n;rZImy:(*}+Y#=3gOwL4('BOE^B~V@6m|$y[N]^ykqYqw,@mJp@M__QRa{XFb%kE3=ho.bseCqQs1LaqY(oWZ{qG}_jVYB{dcy7>JIdQ,83[~@T=t.xh>s^)I3S",
46+ # "dki(|C|x|E3ENOpuMCkQQ`0%1uSYMBP9+|Du+_z)@P)~E05zR~atrERVGz7>G*tc4.1#{5ODYvw_l6vXt6!8xec+]qfhhKH6pZqkg<`y%?cYwG)si>XY`REyahjn*N:*l_?jkIKfO%T2ay4T>k=b,sJKS7ae~9ch",
47+ # "0|>m/o=Mxc,`6AEz*|!|6N|DR}x87Y~9ag[3'>XUt;4$Df?!O|SWPR.`mut_&Ktb*F*5g%G!Y~>10E&gS1S}uh>=66aN#@z4#=Pl|1>VOw^+H)J!kC{N_B.^I8uZl)vS)}3ESdrM^dmPi:!t1KF>GKtIWs]'k:{1",
48+ # "oYEW$gwzt1#)IW2DqE2L+T{K@acd!%[0G.G!VF.pouqN0&J@o%4G0EJa}(sW5V3%>Tq@GuL9R!)uOvS2HWvaf'K`_G):7}(wh=]k%k%d,V#}e)pCU)kSQZ.>u}/we4f@&FB'C;z[`S?%'C%*|HTB/9L4`5,{ccQn",
49+ # "sr9eH'cjnDxC>z@hrT]BblzvVdXn?l!T)i%__BKiBXSD%N;Uh[5D,vjj|I</ItSiH`jNI$6e{.8b1<)EVYvUT6YZ<zoCus{2~O|/X!=;rUyf32jjgn[[A{;,Q;b/C`8dYTcq+Ue^/ze0<Oj6a?IVwccBz6Qax#/a",
50+ # "u>tCP8c!.qEBcC(ES.yRWNH>W`*l^z4a:flaa6;GzM@|R!soPT[B<0f?Wq:$(xDwCPz!>:Qic5{ca665am:(`[L<1v,qo+E~'(')Cvs;=ze_WkLDbL&PFns;eb]q375:QR8y.NAVE+Nb]OKnuQ1X9HDq.c#~U8_`",
51+ # "L&M:NFQTXR*)1b~X,rmfxj?~C=uewOk@fVNUS=@W$L>`fy:.q5QP|5s'B}upI$,jmh[h};CXs.XWzTn*9q>apPu*@]'vq01E+,'6]yCaK#AT^J58Kn7[`O'}OP;K@.wTB7#(90H2P:OH>}*o}b'B<ur7:FF0q43X",
52+ # "6%pS^e+fg?frqYI+T8kj8^7[@)DV}.',Gh<;oioh<.5s&F%uIP?f=EN`:4&GIB]c6`%<4Dei%S+MI@k,vK|xGhE27::kKM?HU+'gK7Jy)_%~xV<`Q!,Y$xBZxnO$E<#Lsvz#tcDSjit:9yhBC&6idCOfjQ~fREp4",
53+ # ".T4?o9^$&h9l$A)~jxwTrFYE*YV?]F0?9&ZSeSF=uJd1wswalB~2=_E[^HlDkZAcIjwx17oci_Ud1O>4~)yavI*`Vv'HD")
54+ # sourceCpp(code = decode_source(ogsource))
55+
56+ # run_og <- function(query, target, max_distance, show_progress = F) {
57+ # results <- og_levenshteinSearch(query, target, max_distance = max_distance)
58+ # results$query <- query[results$query+1]
59+ # results$target <- target[results$target+1]
60+ # results %>% arrange(query, target)
61+ # }
6262
6363# run_dnatree <- function(query, target, max_distance=NULL, max_fraction=NULL, mode = "levenshtein", show_progress = FALSE, nthreads = 8) {
6464# x <- treedist::DNATree$new()
@@ -106,8 +106,8 @@ run_stringdist <- function(query, target, max_distance=NULL, max_fraction=NULL,
106106# methods <- list(run_dnatree, run_radixtree, run_radixforest, run_prefixtree, run_stringdist, run_og)
107107# names(methods) <- c("DNATree", "RadixTree", "RadixForest", "PrefixTree", "stringdist", "OG")
108108
109- methods <- list (run_radixtree , run_radixforest , run_og )
110- names(methods ) <- c(" RadixTree" , " RadixForest" , " OG " )
109+ methods <- list (run_radixtree , run_radixforest )
110+ names(methods ) <- c(" RadixTree" , " RadixForest" )
111111
112112# data("covid_cdr3")
113113cc3_subset <- sample(covid_cdr3 , size = 1000 )
0 commit comments