Skip to content

Commit 608c66d

Browse files
authored
Merge branch 'main' into loopordersfaster
2 parents dc53c94 + 7a0ffd4 commit 608c66d

27 files changed

Lines changed: 322 additions & 2632 deletions

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ jobs:
2525
fail-fast: false
2626
matrix:
2727
version:
28-
- '1.5'
28+
- '1.6'
2929
- '1' # automatically expands to the latest stable 1.x release of Julia.
3030
os:
3131
- ubuntu-latest

Project.toml

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <elrodc@gmail.com>"]
4-
version = "0.12.108"
4+
version = "0.12.121"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
8+
ArrayInterfaceCore = "30b0a656-2188-435a-8636-2ec0e6a096e2"
9+
ArrayInterfaceOffsetArrays = "015c0d05-e682-4f19-8f0a-679ce4c54826"
10+
ArrayInterfaceStaticArrays = "b0d46f97-bff5-4637-a19a-dd75974142cd"
811
CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
912
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
1013
CloseOpenIntervals = "fb6a15b2-703c-40df-9091-08a04967cfa9"
@@ -17,30 +20,37 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
1720
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
1821
PolyesterWeave = "1d0040c9-8b98-4ee7-8388-3f51789ca0ad"
1922
SIMDDualNumbers = "3cdde19b-5bb0-4aaf-8931-af3e248e098b"
23+
SIMDTypes = "94e857df-77ce-4151-89e5-788b33177be4"
2024
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
25+
SnoopPrecompile = "66db9d55-30c0-4569-8b51-7e840670fc0c"
2126
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
2227
Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
2328
ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
2429
UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
2530
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
2631

2732
[compat]
28-
ArrayInterface = "3.1.32, 3.2.1, 5.0.1"
33+
ArrayInterface = "6"
34+
ArrayInterfaceCore = "0.1.5"
35+
ArrayInterfaceOffsetArrays = "0.1.2"
36+
ArrayInterfaceStaticArrays = "0.1.2"
2937
CPUSummary = "0.1.3 - 0.1.8, 0.1.11"
3038
ChainRulesCore = "1"
31-
CloseOpenIntervals = "0.1.2"
32-
DocStringExtensions = "0.8"
39+
CloseOpenIntervals = "0.1.10"
40+
DocStringExtensions = "0.8, 0.9"
3341
ForwardDiff = "0.9, 0.10"
3442
HostCPUFeatures = "0.1.3"
3543
IfElse = "0.1"
3644
LayoutPointers = "0.1.2"
3745
OffsetArrays = "1.4.1"
3846
PolyesterWeave = "0.1"
3947
SIMDDualNumbers = "0.1"
48+
SIMDTypes = "0.1"
4049
SLEEFPirates = "0.6.23"
50+
SnoopPrecompile = "1"
4151
SpecialFunctions = "1, 2"
42-
Static = "0.3.3, 0.4, 0.6"
52+
Static = "0.7"
4353
ThreadingUtilities = "0.5"
4454
UnPack = "1"
4555
VectorizationBase = "0.21.21"
46-
julia = "1.5"
56+
julia = "1.6"

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,5 +361,6 @@ Similar approaches can be taken to make kernels working with a variety of numeri
361361
* [Trixi.jl](https://github.com/trixi-framework/Trixi.jl)
362362
* [VectorizedStatistics.jl](https://github.com/JuliaSIMD/VectorizedStatistics.jl)
363363
* [NaNStatistics.jl](https://github.com/brenhinkeller/NaNStatistics.jl)
364+
* [VectorizedReduction.jl](https://github.com/andrewjradcliffe/VectorizedReduction.jl)
364365

365366
If you're using LoopVectorization, please feel free to file a PR adding yours to the list!

src/LoopVectorization.jl

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
11
module LoopVectorization
22

3-
using Static: StaticInt, gt, static
4-
using VectorizationBase, SLEEFPirates, UnPack, OffsetArrays
3+
using ArrayInterfaceCore: UpTri, LoTri
4+
using Static: StaticInt, gt, static, Zero, One, reduce_tup
5+
using VectorizationBase,
6+
SLEEFPirates, UnPack, OffsetArrays, ArrayInterfaceOffsetArrays, ArrayInterfaceStaticArrays
7+
using LayoutPointers:
8+
AbstractStridedPointer, StridedPointer, StridedBitPointer, grouped_strided_pointer,
9+
stridedpointer_preserve, GroupedStridedPointers
10+
import LayoutPointers
11+
12+
using SIMDTypes: NativeTypes
13+
514
using VectorizationBase:
615
mask,
716
MM,
817
AbstractMask,
918
data,
10-
grouped_strided_pointer,
1119
AbstractSIMD,
1220
vzero,
1321
offsetprecalc,
@@ -30,7 +38,6 @@ using VectorizationBase:
3038
maybestaticlast,
3139
gep,
3240
gesp,
33-
NativeTypes, #llvmptr,
3441
vfmadd,
3542
vfmsub,
3643
vfnmadd,
@@ -51,9 +58,6 @@ using VectorizationBase:
5158
vmul_fast,
5259
relu,
5360
stridedpointer,
54-
StridedPointer,
55-
StridedBitPointer,
56-
AbstractStridedPointer,
5761
_vload,
5862
_vstore!,
5963
reduced_add,
@@ -74,7 +78,6 @@ using VectorizationBase:
7478
vminimum,
7579
vany,
7680
vall,
77-
unwrap,
7881
Unroll,
7982
VecUnroll,
8083
preserve_buffer,
@@ -98,10 +101,9 @@ using VectorizationBase:
98101
maybestaticsize#,zero_mask
99102

100103
using HostCPUFeatures:
101-
pick_vector_width, register_size, register_count, has_opmask_registers
104+
pick_vector_width, register_size, register_count, has_opmask_registers, unwrap
102105
using CPUSummary: num_threads, num_cores, cache_linesize, cache_size
103106

104-
using LayoutPointers: stridedpointer_preserve, GroupedStridedPointers
105107

106108
using IfElse: ifelse
107109

@@ -131,15 +133,10 @@ using ArrayInterface
131133
using ArrayInterface:
132134
OptionallyStaticUnitRange,
133135
OptionallyStaticRange,
134-
Zero,
135-
One,
136136
StaticBool,
137137
True,
138138
False,
139-
reduce_tup,
140139
indices,
141-
UpTri,
142-
LoTri,
143140
strides,
144141
offsets,
145142
size,
@@ -234,9 +231,9 @@ loop-reordering so as to improve performance:
234231
LoopVectorization
235232

236233
include("precompile.jl")
237-
_precompile_()
234+
# _precompile_()
238235

239-
_vreduce(+, Float64[1.0])
236+
# _vreduce(+, Float64[1.0])
240237
# matmul_params(64, 32, 64)
241238

242239
# import ChainRulesCore, ForwardDiff

0 commit comments

Comments
 (0)