11module LoopVectorization
22
3- using Static: StaticInt, gt, static
4- using VectorizationBase, SLEEFPirates, UnPack, OffsetArrays
3+ using ArrayInterfaceCore: UpTri, LoTri
4+ using Static: StaticInt, gt, static, Zero, One, reduce_tup
5+ using VectorizationBase,
6+ SLEEFPirates, UnPack, OffsetArrays, ArrayInterfaceOffsetArrays, ArrayInterfaceStaticArrays
7+ using LayoutPointers:
8+ AbstractStridedPointer, StridedPointer, StridedBitPointer, grouped_strided_pointer,
9+ stridedpointer_preserve, GroupedStridedPointers
10+ import LayoutPointers
11+
12+ using SIMDTypes: NativeTypes
13+
514using VectorizationBase:
615 mask,
716 MM,
817 AbstractMask,
918 data,
10- grouped_strided_pointer,
1119 AbstractSIMD,
1220 vzero,
1321 offsetprecalc,
@@ -30,7 +38,6 @@ using VectorizationBase:
3038 maybestaticlast,
3139 gep,
3240 gesp,
33- NativeTypes, # llvmptr,
3441 vfmadd,
3542 vfmsub,
3643 vfnmadd,
@@ -51,9 +58,6 @@ using VectorizationBase:
5158 vmul_fast,
5259 relu,
5360 stridedpointer,
54- StridedPointer,
55- StridedBitPointer,
56- AbstractStridedPointer,
5761 _vload,
5862 _vstore!,
5963 reduced_add,
@@ -74,7 +78,6 @@ using VectorizationBase:
7478 vminimum,
7579 vany,
7680 vall,
77- unwrap,
7881 Unroll,
7982 VecUnroll,
8083 preserve_buffer,
@@ -98,10 +101,9 @@ using VectorizationBase:
98101 maybestaticsize# ,zero_mask
99102
100103using HostCPUFeatures:
101- pick_vector_width, register_size, register_count, has_opmask_registers
104+ pick_vector_width, register_size, register_count, has_opmask_registers, unwrap
102105using CPUSummary: num_threads, num_cores, cache_linesize, cache_size
103106
104- using LayoutPointers: stridedpointer_preserve, GroupedStridedPointers
105107
106108using IfElse: ifelse
107109
@@ -131,15 +133,10 @@ using ArrayInterface
131133using ArrayInterface:
132134 OptionallyStaticUnitRange,
133135 OptionallyStaticRange,
134- Zero,
135- One,
136136 StaticBool,
137137 True,
138138 False,
139- reduce_tup,
140139 indices,
141- UpTri,
142- LoTri,
143140 strides,
144141 offsets,
145142 size,
@@ -234,9 +231,9 @@ loop-reordering so as to improve performance:
234231LoopVectorization
235232
236233include("precompile.jl")
237- _precompile_()
234+ # _precompile_()
238235
239- _vreduce(+, Float64[1.0])
236+ # _vreduce(+, Float64[1.0])
240237# matmul_params(64, 32, 64)
241238
242239# import ChainRulesCore, ForwardDiff
0 commit comments