JuliaSIMD
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
Lines changed: 1 addition & 1 deletion
diff --git a/Project.toml b/Project.toml
Lines changed: 16 additions & 6 deletions
diff --git a/README.md b/README.md
Lines changed: 1 addition & 0 deletions
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
Lines changed: 14 additions & 17 deletions
@@ -25,7 +25,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.5'
+          - '1.6'
           - '1' # automatically expands to the latest stable 1.x release of Julia.
         os:
           - ubuntu-latest
 
@@ -1,10 +1,13 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.12.108"
+version = "0.12.121"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
+ArrayInterfaceCore = "30b0a656-2188-435a-8636-2ec0e6a096e2"
+ArrayInterfaceOffsetArrays = "015c0d05-e682-4f19-8f0a-679ce4c54826"
+ArrayInterfaceStaticArrays = "b0d46f97-bff5-4637-a19a-dd75974142cd"
 CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 CloseOpenIntervals = "fb6a15b2-703c-40df-9091-08a04967cfa9"
@@ -17,30 +20,37 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 PolyesterWeave = "1d0040c9-8b98-4ee7-8388-3f51789ca0ad"
 SIMDDualNumbers = "3cdde19b-5bb0-4aaf-8931-af3e248e098b"
+SIMDTypes = "94e857df-77ce-4151-89e5-788b33177be4"
 SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
+SnoopPrecompile = "66db9d55-30c0-4569-8b51-7e840670fc0c"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
 ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
 UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [compat]
-ArrayInterface = "3.1.32, 3.2.1, 5.0.1"
+ArrayInterface = "6"
+ArrayInterfaceCore = "0.1.5"
+ArrayInterfaceOffsetArrays = "0.1.2"
+ArrayInterfaceStaticArrays = "0.1.2"
 CPUSummary = "0.1.3 - 0.1.8, 0.1.11"
 ChainRulesCore = "1"
-CloseOpenIntervals = "0.1.2"
-DocStringExtensions = "0.8"
+CloseOpenIntervals = "0.1.10"
+DocStringExtensions = "0.8, 0.9"
 ForwardDiff = "0.9, 0.10"
 HostCPUFeatures = "0.1.3"
 IfElse = "0.1"
 LayoutPointers = "0.1.2"
 OffsetArrays = "1.4.1"
 PolyesterWeave = "0.1"
 SIMDDualNumbers = "0.1"
+SIMDTypes = "0.1"
 SLEEFPirates = "0.6.23"
+SnoopPrecompile = "1"
 SpecialFunctions = "1, 2"
-Static = "0.3.3, 0.4, 0.6"
+Static = "0.7"
 ThreadingUtilities = "0.5"
 UnPack = "1"
 VectorizationBase = "0.21.21"
-julia = "1.5"
+julia = "1.6"
@@ -361,5 +361,6 @@ Similar approaches can be taken to make kernels working with a variety of numeri
 * [Trixi.jl](https://github.com/trixi-framework/Trixi.jl)
 * [VectorizedStatistics.jl](https://github.com/JuliaSIMD/VectorizedStatistics.jl)
 * [NaNStatistics.jl](https://github.com/brenhinkeller/NaNStatistics.jl)
+* [VectorizedReduction.jl](https://github.com/andrewjradcliffe/VectorizedReduction.jl)
 
 If you're using LoopVectorization, please feel free to file a PR adding yours to the list!
@@ -1,13 +1,21 @@
 module LoopVectorization
 
-using Static: StaticInt, gt, static
-using VectorizationBase, SLEEFPirates, UnPack, OffsetArrays
+using ArrayInterfaceCore: UpTri, LoTri
+using Static: StaticInt, gt, static, Zero, One, reduce_tup
+using VectorizationBase,
+  SLEEFPirates, UnPack, OffsetArrays, ArrayInterfaceOffsetArrays, ArrayInterfaceStaticArrays
+using LayoutPointers:
+  AbstractStridedPointer, StridedPointer, StridedBitPointer, grouped_strided_pointer,
+  stridedpointer_preserve, GroupedStridedPointers
+import LayoutPointers
+
+using SIMDTypes: NativeTypes
+
 using VectorizationBase:
   mask,
   MM,
   AbstractMask,
   data,
-  grouped_strided_pointer,
   AbstractSIMD,
   vzero,
   offsetprecalc,
@@ -30,7 +38,6 @@ using VectorizationBase:
   maybestaticlast,
   gep,
   gesp,
-  NativeTypes, #llvmptr,
   vfmadd,
   vfmsub,
   vfnmadd,
@@ -51,9 +58,6 @@ using VectorizationBase:
   vmul_fast,
   relu,
   stridedpointer,
-  StridedPointer,
-  StridedBitPointer,
-  AbstractStridedPointer,
   _vload,
   _vstore!,
   reduced_add,
@@ -74,7 +78,6 @@ using VectorizationBase:
   vminimum,
   vany,
   vall,
-  unwrap,
   Unroll,
   VecUnroll,
   preserve_buffer,
@@ -98,10 +101,9 @@ using VectorizationBase:
   maybestaticsize#,zero_mask
 
 using HostCPUFeatures:
-  pick_vector_width, register_size, register_count, has_opmask_registers
+  pick_vector_width, register_size, register_count, has_opmask_registers, unwrap
 using CPUSummary: num_threads, num_cores, cache_linesize, cache_size
 
-using LayoutPointers: stridedpointer_preserve, GroupedStridedPointers
 
 using IfElse: ifelse
 
@@ -131,15 +133,10 @@ using ArrayInterface
 using ArrayInterface:
   OptionallyStaticUnitRange,
   OptionallyStaticRange,
-  Zero,
-  One,
   StaticBool,
   True,
   False,
-  reduce_tup,
   indices,
-  UpTri,
-  LoTri,
   strides,
   offsets,
   size,
@@ -234,9 +231,9 @@ loop-reordering so as to improve performance:
 LoopVectorization
 
 include("precompile.jl")
-_precompile_()
+# _precompile_()
 
-_vreduce(+, Float64[1.0])
+# _vreduce(+, Float64[1.0])
 # matmul_params(64, 32, 64)
 
 # import ChainRulesCore, ForwardDiff