Skip to content

Commit 69e7b22

Browse files
WANDY666hiworldwzj
andauthored
Optimize omni merge (#1255)
Co-authored-by: wangzaijun <wzjhelloworld@qq.com>
1 parent 40d8fdc commit 69e7b22

24 files changed

Lines changed: 1149 additions & 111 deletions

File tree

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
{
2+
"1024": {
3+
"BLOCK_SIZE_K": 32,
4+
"BLOCK_SIZE_M": 16,
5+
"BLOCK_SIZE_N": 64,
6+
"GROUP_SIZE_M": 32,
7+
"NEED_TRANS": false,
8+
"num_stages": 3,
9+
"num_warps": 4
10+
},
11+
"128": {
12+
"BLOCK_SIZE_K": 32,
13+
"BLOCK_SIZE_M": 16,
14+
"BLOCK_SIZE_N": 128,
15+
"GROUP_SIZE_M": 1,
16+
"NEED_TRANS": false,
17+
"num_stages": 3,
18+
"num_warps": 8
19+
},
20+
"16384": {
21+
"BLOCK_SIZE_K": 64,
22+
"BLOCK_SIZE_M": 64,
23+
"BLOCK_SIZE_N": 128,
24+
"GROUP_SIZE_M": 32,
25+
"NEED_TRANS": false,
26+
"num_stages": 3,
27+
"num_warps": 4
28+
},
29+
"2048": {
30+
"BLOCK_SIZE_K": 32,
31+
"BLOCK_SIZE_M": 16,
32+
"BLOCK_SIZE_N": 32,
33+
"GROUP_SIZE_M": 64,
34+
"NEED_TRANS": false,
35+
"num_stages": 3,
36+
"num_warps": 4
37+
},
38+
"256": {
39+
"BLOCK_SIZE_K": 32,
40+
"BLOCK_SIZE_M": 16,
41+
"BLOCK_SIZE_N": 128,
42+
"GROUP_SIZE_M": 1,
43+
"NEED_TRANS": false,
44+
"num_stages": 3,
45+
"num_warps": 8
46+
},
47+
"512": {
48+
"BLOCK_SIZE_K": 32,
49+
"BLOCK_SIZE_M": 16,
50+
"BLOCK_SIZE_N": 64,
51+
"GROUP_SIZE_M": 64,
52+
"NEED_TRANS": false,
53+
"num_stages": 4,
54+
"num_warps": 4
55+
},
56+
"64": {
57+
"BLOCK_SIZE_K": 32,
58+
"BLOCK_SIZE_M": 16,
59+
"BLOCK_SIZE_N": 128,
60+
"GROUP_SIZE_M": 1,
61+
"NEED_TRANS": false,
62+
"num_stages": 2,
63+
"num_warps": 4
64+
},
65+
"8": {
66+
"BLOCK_SIZE_K": 32,
67+
"BLOCK_SIZE_M": 16,
68+
"BLOCK_SIZE_N": 128,
69+
"GROUP_SIZE_M": 1,
70+
"NEED_TRANS": false,
71+
"num_stages": 3,
72+
"num_warps": 4
73+
},
74+
"800": {
75+
"BLOCK_SIZE_K": 32,
76+
"BLOCK_SIZE_M": 32,
77+
"BLOCK_SIZE_N": 64,
78+
"GROUP_SIZE_M": 32,
79+
"NEED_TRANS": false,
80+
"num_stages": 5,
81+
"num_warps": 4
82+
},
83+
"8192": {
84+
"BLOCK_SIZE_K": 64,
85+
"BLOCK_SIZE_M": 32,
86+
"BLOCK_SIZE_N": 128,
87+
"GROUP_SIZE_M": 32,
88+
"NEED_TRANS": false,
89+
"num_stages": 2,
90+
"num_warps": 4
91+
}
92+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
{
2+
"1": {
3+
"BLOCK_SIZE_K": 128,
4+
"BLOCK_SIZE_M": 16,
5+
"BLOCK_SIZE_N": 32,
6+
"GROUP_SIZE_M": 16,
7+
"NEED_TRANS": false,
8+
"num_stages": 5,
9+
"num_warps": 4
10+
},
11+
"100": {
12+
"BLOCK_SIZE_K": 64,
13+
"BLOCK_SIZE_M": 32,
14+
"BLOCK_SIZE_N": 16,
15+
"GROUP_SIZE_M": 16,
16+
"NEED_TRANS": false,
17+
"num_stages": 3,
18+
"num_warps": 4
19+
},
20+
"1024": {
21+
"BLOCK_SIZE_K": 128,
22+
"BLOCK_SIZE_M": 32,
23+
"BLOCK_SIZE_N": 128,
24+
"GROUP_SIZE_M": 32,
25+
"NEED_TRANS": false,
26+
"num_stages": 2,
27+
"num_warps": 4
28+
},
29+
"128": {
30+
"BLOCK_SIZE_K": 64,
31+
"BLOCK_SIZE_M": 16,
32+
"BLOCK_SIZE_N": 32,
33+
"GROUP_SIZE_M": 1,
34+
"NEED_TRANS": false,
35+
"num_stages": 5,
36+
"num_warps": 4
37+
},
38+
"16": {
39+
"BLOCK_SIZE_K": 128,
40+
"BLOCK_SIZE_M": 16,
41+
"BLOCK_SIZE_N": 32,
42+
"GROUP_SIZE_M": 1,
43+
"NEED_TRANS": false,
44+
"num_stages": 2,
45+
"num_warps": 4
46+
},
47+
"2048": {
48+
"BLOCK_SIZE_K": 64,
49+
"BLOCK_SIZE_M": 64,
50+
"BLOCK_SIZE_N": 128,
51+
"GROUP_SIZE_M": 64,
52+
"NEED_TRANS": false,
53+
"num_stages": 3,
54+
"num_warps": 8
55+
},
56+
"256": {
57+
"BLOCK_SIZE_K": 128,
58+
"BLOCK_SIZE_M": 16,
59+
"BLOCK_SIZE_N": 32,
60+
"GROUP_SIZE_M": 1,
61+
"NEED_TRANS": false,
62+
"num_stages": 4,
63+
"num_warps": 4
64+
},
65+
"32": {
66+
"BLOCK_SIZE_K": 64,
67+
"BLOCK_SIZE_M": 16,
68+
"BLOCK_SIZE_N": 128,
69+
"GROUP_SIZE_M": 64,
70+
"NEED_TRANS": false,
71+
"num_stages": 3,
72+
"num_warps": 8
73+
},
74+
"64": {
75+
"BLOCK_SIZE_K": 64,
76+
"BLOCK_SIZE_M": 16,
77+
"BLOCK_SIZE_N": 32,
78+
"GROUP_SIZE_M": 1,
79+
"NEED_TRANS": false,
80+
"num_stages": 5,
81+
"num_warps": 4
82+
},
83+
"8": {
84+
"BLOCK_SIZE_K": 64,
85+
"BLOCK_SIZE_M": 16,
86+
"BLOCK_SIZE_N": 128,
87+
"GROUP_SIZE_M": 32,
88+
"NEED_TRANS": false,
89+
"num_stages": 5,
90+
"num_warps": 8
91+
}
92+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
{
2+
"1": {
3+
"BLOCK_SIZE_K": 128,
4+
"BLOCK_SIZE_M": 16,
5+
"BLOCK_SIZE_N": 64,
6+
"GROUP_SIZE_M": 16,
7+
"NEED_TRANS": false,
8+
"num_stages": 3,
9+
"num_warps": 4
10+
},
11+
"100": {
12+
"BLOCK_SIZE_K": 128,
13+
"BLOCK_SIZE_M": 16,
14+
"BLOCK_SIZE_N": 32,
15+
"GROUP_SIZE_M": 1,
16+
"NEED_TRANS": true,
17+
"num_stages": 4,
18+
"num_warps": 4
19+
},
20+
"1024": {
21+
"BLOCK_SIZE_K": 128,
22+
"BLOCK_SIZE_M": 32,
23+
"BLOCK_SIZE_N": 128,
24+
"GROUP_SIZE_M": 1,
25+
"NEED_TRANS": true,
26+
"num_stages": 2,
27+
"num_warps": 4
28+
},
29+
"128": {
30+
"BLOCK_SIZE_K": 128,
31+
"BLOCK_SIZE_M": 16,
32+
"BLOCK_SIZE_N": 64,
33+
"GROUP_SIZE_M": 16,
34+
"NEED_TRANS": false,
35+
"num_stages": 3,
36+
"num_warps": 4
37+
},
38+
"16": {
39+
"BLOCK_SIZE_K": 128,
40+
"BLOCK_SIZE_M": 16,
41+
"BLOCK_SIZE_N": 32,
42+
"GROUP_SIZE_M": 32,
43+
"NEED_TRANS": false,
44+
"num_stages": 3,
45+
"num_warps": 4
46+
},
47+
"2048": {
48+
"BLOCK_SIZE_K": 128,
49+
"BLOCK_SIZE_M": 64,
50+
"BLOCK_SIZE_N": 64,
51+
"GROUP_SIZE_M": 16,
52+
"NEED_TRANS": true,
53+
"num_stages": 3,
54+
"num_warps": 4
55+
},
56+
"256": {
57+
"BLOCK_SIZE_K": 128,
58+
"BLOCK_SIZE_M": 16,
59+
"BLOCK_SIZE_N": 64,
60+
"GROUP_SIZE_M": 64,
61+
"NEED_TRANS": false,
62+
"num_stages": 3,
63+
"num_warps": 4
64+
},
65+
"32": {
66+
"BLOCK_SIZE_K": 128,
67+
"BLOCK_SIZE_M": 16,
68+
"BLOCK_SIZE_N": 32,
69+
"GROUP_SIZE_M": 32,
70+
"NEED_TRANS": false,
71+
"num_stages": 3,
72+
"num_warps": 4
73+
},
74+
"4096": {
75+
"BLOCK_SIZE_K": 128,
76+
"BLOCK_SIZE_M": 64,
77+
"BLOCK_SIZE_N": 128,
78+
"GROUP_SIZE_M": 16,
79+
"NEED_TRANS": false,
80+
"num_stages": 3,
81+
"num_warps": 8
82+
},
83+
"64": {
84+
"BLOCK_SIZE_K": 128,
85+
"BLOCK_SIZE_M": 16,
86+
"BLOCK_SIZE_N": 32,
87+
"GROUP_SIZE_M": 1,
88+
"NEED_TRANS": true,
89+
"num_stages": 3,
90+
"num_warps": 4
91+
},
92+
"8": {
93+
"BLOCK_SIZE_K": 128,
94+
"BLOCK_SIZE_M": 16,
95+
"BLOCK_SIZE_N": 128,
96+
"GROUP_SIZE_M": 1,
97+
"NEED_TRANS": true,
98+
"num_stages": 3,
99+
"num_warps": 8
100+
}
101+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
{
2+
"1024": {
3+
"BLOCK_SIZE_K": 64,
4+
"BLOCK_SIZE_M": 16,
5+
"BLOCK_SIZE_N": 128,
6+
"GROUP_SIZE_M": 64,
7+
"NEED_TRANS": true,
8+
"num_stages": 4,
9+
"num_warps": 8
10+
},
11+
"128": {
12+
"BLOCK_SIZE_K": 64,
13+
"BLOCK_SIZE_M": 16,
14+
"BLOCK_SIZE_N": 128,
15+
"GROUP_SIZE_M": 1,
16+
"NEED_TRANS": false,
17+
"num_stages": 3,
18+
"num_warps": 8
19+
},
20+
"16384": {
21+
"BLOCK_SIZE_K": 128,
22+
"BLOCK_SIZE_M": 64,
23+
"BLOCK_SIZE_N": 128,
24+
"GROUP_SIZE_M": 64,
25+
"NEED_TRANS": true,
26+
"num_stages": 3,
27+
"num_warps": 4
28+
},
29+
"2048": {
30+
"BLOCK_SIZE_K": 128,
31+
"BLOCK_SIZE_M": 16,
32+
"BLOCK_SIZE_N": 64,
33+
"GROUP_SIZE_M": 64,
34+
"NEED_TRANS": false,
35+
"num_stages": 3,
36+
"num_warps": 4
37+
},
38+
"256": {
39+
"BLOCK_SIZE_K": 64,
40+
"BLOCK_SIZE_M": 16,
41+
"BLOCK_SIZE_N": 128,
42+
"GROUP_SIZE_M": 64,
43+
"NEED_TRANS": false,
44+
"num_stages": 3,
45+
"num_warps": 8
46+
},
47+
"32768": {
48+
"BLOCK_SIZE_K": 128,
49+
"BLOCK_SIZE_M": 64,
50+
"BLOCK_SIZE_N": 128,
51+
"GROUP_SIZE_M": 64,
52+
"NEED_TRANS": false,
53+
"num_stages": 3,
54+
"num_warps": 8
55+
},
56+
"512": {
57+
"BLOCK_SIZE_K": 64,
58+
"BLOCK_SIZE_M": 16,
59+
"BLOCK_SIZE_N": 64,
60+
"GROUP_SIZE_M": 64,
61+
"NEED_TRANS": false,
62+
"num_stages": 4,
63+
"num_warps": 4
64+
},
65+
"64": {
66+
"BLOCK_SIZE_K": 64,
67+
"BLOCK_SIZE_M": 16,
68+
"BLOCK_SIZE_N": 128,
69+
"GROUP_SIZE_M": 1,
70+
"NEED_TRANS": false,
71+
"num_stages": 2,
72+
"num_warps": 4
73+
},
74+
"8": {
75+
"BLOCK_SIZE_K": 64,
76+
"BLOCK_SIZE_M": 16,
77+
"BLOCK_SIZE_N": 64,
78+
"GROUP_SIZE_M": 16,
79+
"NEED_TRANS": true,
80+
"num_stages": 3,
81+
"num_warps": 4
82+
},
83+
"800": {
84+
"BLOCK_SIZE_K": 64,
85+
"BLOCK_SIZE_M": 16,
86+
"BLOCK_SIZE_N": 32,
87+
"GROUP_SIZE_M": 64,
88+
"NEED_TRANS": true,
89+
"num_stages": 4,
90+
"num_warps": 4
91+
},
92+
"8192": {
93+
"BLOCK_SIZE_K": 128,
94+
"BLOCK_SIZE_M": 64,
95+
"BLOCK_SIZE_N": 64,
96+
"GROUP_SIZE_M": 32,
97+
"NEED_TRANS": false,
98+
"num_stages": 2,
99+
"num_warps": 4
100+
}
101+
}

0 commit comments

Comments
 (0)