aikenml committed on
Commit
e5f748f
·
1 Parent(s): 90b65ad

Upload folder using huggingface_hub

Pytorch-Correlation-extension/.gitignore ADDED
@@ -0,0 +1 @@
+ *.egg*
Pytorch-Correlation-extension/Correlation_Module/correlation.cpp ADDED
@@ -0,0 +1,178 @@
+ #include <torch/extension.h>
+ using namespace torch;
+
+ #include <vector>
+
+ #define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)
+
+ template <typename scalar_t>
+ static void correlate_patch(
+     TensorAccessor<scalar_t,3> input1,
+     TensorAccessor<scalar_t,3> input2,
+     scalar_t *dst,
+     int kH, int kW,
+     int dilationH, int dilationW,
+     int u, int v,
+     int shiftU, int shiftV){
+   const int C = input1.size(0);
+   const int iH = input1.size(1);
+   const int iW = input1.size(2);
+   for (int c=0; c<C; ++c){
+     for (int i=0; i<kH; ++i){
+       int i1 = u + i * dilationH;
+       int i2 = i1 + shiftU;
+       if WITHIN_BOUNDS(i1, i2, iH, iH){
+         for (int j=0; j<kW; ++j){
+           int j1 = v + j * dilationW;
+           int j2 = j1 + shiftV;
+           if WITHIN_BOUNDS(j1, j2, iW, iW){
+             scalar_t v1 = input1[c][i1][j1];
+             scalar_t v2 = input2[c][i2][j2];
+             *dst += v1 * v2;
+           }
+         }
+       }
+     }
+   }
+ }
+
+ template <typename scalar_t>
+ static void correlate_patch_grad(
+     TensorAccessor<scalar_t,3> input1,
+     TensorAccessor<scalar_t,3> gradInput1,
+     TensorAccessor<scalar_t,3> input2,
+     TensorAccessor<scalar_t,3> gradInput2,
+     scalar_t gradOutput,
+     int kH, int kW,
+     int dilationH, int dilationW,
+     int u, int v,
+     int shiftU, int shiftV){
+
+   const int C = input1.size(0);
+   const int iH = input1.size(1);
+   const int iW = input1.size(2);
+
+   for (int c=0; c<C; ++c){
+     for (int i=0; i<kH; ++i){
+       int i1 = u + i * dilationH;
+       int i2 = i1 + shiftU;
+       if WITHIN_BOUNDS(i1, i2, iH, iH){
+         for (int j=0; j<kW; ++j){
+           int j1 = v + j * dilationW;
+           int j2 = j1 + shiftV;
+           if WITHIN_BOUNDS(j1, j2, iW, iW){
+             scalar_t v1 = input1[c][i1][j1];
+             scalar_t v2 = input2[c][i2][j2];
+             gradInput2[c][i2][j2] += gradOutput * v1;
+             gradInput1[c][i1][j1] += gradOutput * v2;
+           }
+         }
+       }
+     }
+   }
+ }
+
+ torch::Tensor correlation_cpp_forward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW) {
+
+   const auto batch_size = input1.size(0);
+   const auto iH = input1.size(2);
+   const auto iW = input1.size(3);
+   const int patchRadH = (patchH - 1) / 2;
+   const int patchRadW = (patchW - 1) / 2;
+   const int dilatedKH = (kH - 1) * dilationH + 1;
+   const int dilatedKW = (kW - 1) * dilationW + 1;
+
+   const auto oH = (iH + 2 * padH - dilatedKH) / dH + 1;
+   const auto oW = (iW + 2 * padW - dilatedKW) / dW + 1;
+   auto output = at::zeros({batch_size, patchH, patchW, oH, oW}, input1.options());
+
+   int n, ph, pw, h, w;
+   #pragma omp parallel for private(n, ph, pw, h, w) collapse(2)
+   for (n = 0; n < batch_size; ++n) {
+     for(ph = 0; ph < patchH; ++ph){
+       for(pw = 0; pw < patchW; ++pw){
+         AT_DISPATCH_FLOATING_TYPES(input1.scalar_type(), "correlation_forward_cpp", ([&] {
+           auto input1_acc = input1.accessor<scalar_t, 4>();
+           auto input2_acc = input2.accessor<scalar_t, 4>();
+           auto output_acc = output.accessor<scalar_t, 5>();
+           for (h = 0; h < oH; ++h) {
+             for (w = 0; w < oW; ++w) {
+               correlate_patch(input1_acc[n],
+                               input2_acc[n],
+                               &output_acc[n][ph][pw][h][w],
+                               kH, kW,
+                               dilationH, dilationW,
+                               -padH + h * dH,
+                               -padW + w * dW,
+                               (ph - patchRadH) * dilation_patchH,
+                               (pw - patchRadW) * dilation_patchW);
+             }
+           }
+         }));
+       }
+     }
+   }
+   return output;
+ }
+
+ std::vector<torch::Tensor> correlation_cpp_backward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     torch::Tensor gradOutput,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW) {
+
+   const int batch_size = input1.size(0);
+   const int patchRadH = (patchH - 1) / 2;
+   const int patchRadW = (patchW - 1) / 2;
+   const int oH = gradOutput.size(3);
+   const int oW = gradOutput.size(4);
+
+   auto gradInput1 = torch::zeros_like(input1);
+
+   auto gradInput2 = torch::zeros_like(input2);
+
+   int n, ph, pw, h, w;
+   #pragma omp parallel for private(n, ph, pw, h, w)
+   for (n = 0; n < batch_size; ++n) {
+     AT_DISPATCH_FLOATING_TYPES(input1.scalar_type(), "correlation_backward_cpp", ([&] {
+       auto input1_acc = input1.accessor<scalar_t, 4>();
+       auto gradInput1_acc = gradInput1.accessor<scalar_t, 4>();
+       auto input2_acc = input2.accessor<scalar_t, 4>();
+       auto gradInput2_acc = gradInput2.accessor<scalar_t, 4>();
+       auto gradOutput_acc = gradOutput.accessor<scalar_t, 5>();
+
+       for(ph = 0; ph < patchH; ++ph){
+         for(pw = 0; pw < patchW; ++pw){
+           for (h = 0; h < oH; ++h) {
+             for (w = 0; w < oW; ++w) {
+               correlate_patch_grad(input1_acc[n], gradInput1_acc[n],
+                                    input2_acc[n], gradInput2_acc[n],
+                                    gradOutput_acc[n][ph][pw][h][w],
+                                    kH, kW,
+                                    dilationH, dilationW,
+                                    -padH + h * dH,
+                                    -padW + w * dW,
+                                    (ph - patchRadH) * dilation_patchH,
+                                    (pw - patchRadW) * dilation_patchW);
+             }
+           }
+         }
+       }
+     }));
+   }
+
+   return {gradInput1, gradInput2};
+ }
Pytorch-Correlation-extension/Correlation_Module/correlation_cuda_kernel.cu ADDED
@@ -0,0 +1,327 @@
+ #include <torch/types.h>
+ using namespace torch;
+
+ #include <cuda.h>
+ #include <cuda_runtime.h>
+
+ #include <vector>
+ #include <iostream>
+
+ // Cuda tensor accessor definitions
+ // restrict pointer traits prioritize speed over memory consumption
+ #define TensorAcc4R PackedTensorAccessor32<scalar_t,4,RestrictPtrTraits>
+ #define TensorAcc5R PackedTensorAccessor32<scalar_t,5,RestrictPtrTraits>
+ #define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)
+
+ #define THREADS_FORWARD 32
+ #define THREADS_BACKWARD 5
+
+
+ namespace corr {
+ template <typename scalar_t>
+ __global__ void correlation_cuda_forward_kernel(
+     const TensorAcc4R rInput1,
+     const TensorAcc4R rInput2,
+     TensorAcc5R output,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW) {
+
+   const int iH = rInput1.size(1);
+   const int iW = rInput1.size(2);
+   const int C = rInput1.size(3);
+
+   const int n = blockIdx.x;
+   const int h = blockIdx.y;
+   const int w = blockIdx.z;
+   const int thread = threadIdx.x;
+
+   const int start_i = -padH + h * dH;
+   const int start_j = -padW + w * dW;
+
+   const int patchRadH = dilation_patchH * (patchH - 1) / 2;
+   const int patchRadW = dilation_patchW * (patchW - 1) / 2;
+
+   __shared__ scalar_t prod_sum[THREADS_FORWARD];
+
+   for(int ph = 0; ph < patchH; ++ph){
+     int ph_dilated = ph * dilation_patchH - patchRadH;
+     for(int pw = 0; pw < patchW; ++pw){
+       int pw_dilated = pw * dilation_patchW - patchRadW;
+       prod_sum[thread] = 0;
+       for (int i=0; i<kH; ++i){
+         int i1 = start_i + i * dilationH;
+         int i2 = i1 + ph_dilated;
+         if WITHIN_BOUNDS(i1, i2, iH, iH){
+           for (int j=0; j<kW; ++j){
+             int j1 = start_j + j * dilationW;
+             int j2 = j1 + pw_dilated;
+             if WITHIN_BOUNDS(j1, j2, iW, iW){
+               for (int c=thread; c<C; c += THREADS_FORWARD){
+                 scalar_t v1 = rInput1[n][i1][j1][c];
+                 scalar_t v2 = rInput2[n][i2][j2][c];
+                 prod_sum[thread] += v1 * v2;
+               }
+             }
+           }
+         }
+       }
+       // accumulate
+       __syncthreads();
+       if (thread == 0) {
+         scalar_t reduce_sum = 0;
+         for (int index = 0; index < THREADS_FORWARD; ++index) {
+           reduce_sum += prod_sum[index];
+         }
+         output[n][ph][pw][h][w] = reduce_sum;
+       }
+     }
+   }
+ }
+
+
+ template <typename scalar_t>
+ __global__ void correlation_cuda_backward_kernel_input1(
+     const TensorAcc5R gradOutput,
+     const TensorAcc4R input2,
+     TensorAcc4R gradInput1,
+     const int kH, const int kW,
+     const int patchH, const int patchW,
+     const int padH, const int padW,
+     const int dilationH, const int dilationW,
+     const int dilation_patchH, const int dilation_patchW,
+     const int dH, const int dW,
+     const int batch) {
+   const int iH = input2.size(2);
+   const int iW = input2.size(3);
+
+   const int H = gradOutput.size(3);
+   const int W = gradOutput.size(4);
+
+   const int patchRadH = (patchH - 1) / 2;
+   const int patchRadW = (patchW - 1) / 2;
+
+   const int n = batch;
+   const int c = blockIdx.x;
+   const int h = blockIdx.y;
+   const int w = blockIdx.z;
+   const int ph_off = threadIdx.x;
+   const int pw_off = threadIdx.y;
+
+   const int h_2 = h + padH;
+   const int w_2 = w + padW;
+   const int min_h = h_2 - kH * dilationH;
+   const int min_w = w_2 - kW * dilationW;
+
+   __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD];
+   prod_sum[ph_off][pw_off] = 0;
+
+   for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) {
+     int i1 = h + dilation_patchH * (ph - patchRadH);
+     for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) {
+       int j1 = w + dilation_patchW * (pw - patchRadW);
+       if (WITHIN_BOUNDS(i1, j1, iH, iW)){
+         scalar_t val = input2[n][c][i1][j1];
+         for(int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+           int i2 = (h_3)/dH;
+           if (i2 * dH != h_3)
+             continue;
+           for(int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+             int j2 = (w_3) / dW;
+             if(j2 * dW != w_3)
+               continue;
+             if WITHIN_BOUNDS(i2, j2, H, W) {
+               prod_sum[ph_off][pw_off] += gradOutput[n][ph][pw][i2][j2] * val;
+             }
+           }
+         }
+       }
+     }
+   }
+
+   __syncthreads();
+
+   if (ph_off == 0 && pw_off == 0){
+     scalar_t reduce_sum = 0;
+     for (int ph = 0; ph < THREADS_BACKWARD; ++ph){
+       for (int pw = 0; pw < THREADS_BACKWARD; ++pw){
+         reduce_sum += prod_sum[ph][pw];
+       }
+     }
+     gradInput1[n][c][h][w] = reduce_sum;
+   }
+ }
+
+
+ template <typename scalar_t>
+ __global__ void correlation_cuda_backward_kernel_input2(
+     const TensorAcc5R gradOutput,
+     const TensorAcc4R input1,
+     TensorAcc4R gradInput2,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW,
+     int batch) {
+   const int iH = input1.size(2);
+   const int iW = input1.size(3);
+
+   const int patchRadH = (patchH - 1) / 2;
+   const int patchRadW = (patchW - 1) / 2;
+
+   const int H = gradOutput.size(3);
+   const int W = gradOutput.size(4);
+
+   const int dilatedKH = kH * dilationH;
+   const int dilatedKW = kW * dilationW;
+
+   const int n = batch;
+   const int c = blockIdx.x;
+   const int h = blockIdx.y;
+   const int w = blockIdx.z;
+   const int ph_off = threadIdx.x;
+   const int pw_off = threadIdx.y;
+
+   __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD];
+   prod_sum[ph_off][pw_off] = 0;
+
+   for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) {
+     int i1 = h - dilation_patchH * (ph - patchRadH);
+     for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) {
+       int j1 = w - dilation_patchW * (pw - patchRadW);
+       if WITHIN_BOUNDS(i1, j1, iH, iW) {
+         scalar_t val = input1[n][c][i1][j1];
+
+         const int h_2 = i1 + padH;
+         const int w_2 = j1 + padW;
+         const int min_h = h_2 - dilatedKH;
+         const int min_w = w_2 - dilatedKW;
+
+         for(int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+           int i2 = (h_3)/dH;
+           if (i2 * dH != h_3)
+             continue;
+           for(int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+             int j2 = (w_3) / dW;
+             if(j2 * dW != w_3)
+               continue;
+             if WITHIN_BOUNDS(i2, j2, H, W) {
+               prod_sum[ph_off][pw_off] += gradOutput[n][ph][pw][i2][j2] * val;
+             }
+           }
+         }
+       }
+     }
+   }
+
+   __syncthreads();
+
+   if (ph_off == 0 && pw_off == 0){
+     scalar_t reduce_sum = 0;
+     for (int ph = 0; ph < THREADS_BACKWARD; ++ph){
+       for (int pw = 0; pw < THREADS_BACKWARD; ++pw){
+         reduce_sum += prod_sum[ph][pw];
+       }
+     }
+     gradInput2[n][c][h][w] = reduce_sum;
+   }
+ }
+ } // namespace corr
+
+ torch::Tensor correlation_cuda_forward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW) {
+
+   const int batch_size = input1.size(0);
+   const int iH = input1.size(2);
+   const int iW = input1.size(3);
+   const int dilatedKH = (kH - 1) * dilationH + 1;
+   const int dilatedKW = (kW - 1) * dilationW + 1;
+
+   const auto oH = (iH + 2 * padH - dilatedKH) / dH + 1;
+   const auto oW = (iW + 2 * padW - dilatedKW) / dW + 1;
+   auto output = torch::zeros({batch_size, patchH, patchW, oH, oW}, input1.options());
+
+   auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();
+   auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();
+
+   const int threads = THREADS_FORWARD;
+   const dim3 blocks(batch_size, oH, oW);
+
+   AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.scalar_type(), "correlation_forward_cuda", ([&] {
+     TensorAcc4R trInput1_acc = trInput1.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
+     TensorAcc4R trInput2_acc = trInput2.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
+     TensorAcc5R output_acc = output.packed_accessor32<scalar_t,5,RestrictPtrTraits>();
+     corr::correlation_cuda_forward_kernel<scalar_t><<<blocks, threads>>>(
+         trInput1_acc, trInput2_acc, output_acc,
+         kH, kW, patchH, patchW, padH, padW, dilationH, dilationW,
+         dilation_patchH, dilation_patchW, dH, dW);
+   }));
+
+   return output;
+ }
+
+ std::vector<torch::Tensor> correlation_cuda_backward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     torch::Tensor gradOutput,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW) {
+
+   auto gradInput1 = torch::zeros_like(input1);
+   auto gradInput2 = torch::zeros_like(input2);
+
+   const int batch_size = input1.size(0);
+   const int iH = input1.size(2);
+   const int iW = input1.size(3);
+   const int C = input1.size(1);
+
+   const dim3 blocks(C, iH, iW);
+   const dim3 threads(THREADS_BACKWARD, THREADS_BACKWARD);
+
+   AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.scalar_type(), "correlation_backward_cuda", ([&] {
+     TensorAcc4R input1_acc = input1.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
+     TensorAcc4R input2_acc = input2.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
+     TensorAcc4R gradInput1_acc = gradInput1.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
+     TensorAcc4R gradInput2_acc = gradInput2.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
+     TensorAcc5R gradOutput_acc = gradOutput.packed_accessor32<scalar_t,5,RestrictPtrTraits>();
+
+
+     for (int n = 0; n < batch_size; ++n){
+       corr::correlation_cuda_backward_kernel_input1<scalar_t><<<blocks, threads>>>(
+           gradOutput_acc, input2_acc, gradInput1_acc,
+           kH, kW, patchH, patchW, padH, padW,
+           dilationH, dilationW,
+           dilation_patchH, dilation_patchW,
+           dH, dW,
+           n);
+     }
+
+     for (int n = 0; n < batch_size; ++n){
+       corr::correlation_cuda_backward_kernel_input2<scalar_t><<<blocks, threads>>>(
+           gradOutput_acc, input1_acc, gradInput2_acc,
+           kH, kW, patchH, patchW, padH, padW,
+           dilationH, dilationW,
+           dilation_patchH, dilation_patchW,
+           dH, dW,
+           n);
+     }
+   }));
+
+   return {gradInput1, gradInput2};
+ }
Pytorch-Correlation-extension/Correlation_Module/correlation_sampler.cpp ADDED
@@ -0,0 +1,138 @@
+ #include <torch/extension.h>
+ #include <c10/cuda/CUDAGuard.h>
+ #include <vector>
+ #include <iostream>
+
+ // declarations
+
+ torch::Tensor correlation_cpp_forward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW);
+
+ std::vector<torch::Tensor> correlation_cpp_backward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     torch::Tensor grad_output,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW);
+
+ #ifdef USE_CUDA
+
+ #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDA tensor")
+ #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous")
+ #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+ #define CHECK_SAME_DEVICE(x, y) TORCH_CHECK(x.device() == y.device(), #x " is not on same device as " #y)
+
+ torch::Tensor correlation_cuda_forward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW);
+
+ std::vector<torch::Tensor> correlation_cuda_backward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     torch::Tensor grad_output,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW);
+
+ // C++ interface
+
+ torch::Tensor correlation_sample_forward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW) {
+   if (input1.device().is_cuda()){
+     CHECK_INPUT(input1);
+     CHECK_INPUT(input2);
+
+     // set device of input1 as default CUDA device
+     // https://pytorch.org/cppdocs/api/structc10_1_1cuda_1_1_optional_c_u_d_a_guard.html
+     const at::cuda::OptionalCUDAGuard guard_input1(device_of(input1));
+     CHECK_SAME_DEVICE(input1, input2);
+
+     return correlation_cuda_forward(input1, input2, kH, kW, patchH, patchW,
+                                     padH, padW, dilationH, dilationW,
+                                     dilation_patchH, dilation_patchW,
+                                     dH, dW);
+   }else{
+     return correlation_cpp_forward(input1, input2, kH, kW, patchH, patchW,
+                                    padH, padW, dilationH, dilationW,
+                                    dilation_patchH, dilation_patchW,
+                                    dH, dW);
+   }
+ }
+
+ std::vector<torch::Tensor> correlation_sample_backward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     torch::Tensor grad_output,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW) {
+
+   if(grad_output.device().is_cuda()){
+     CHECK_INPUT(input1);
+     CHECK_INPUT(input2);
+
+     // set device of input1 as default CUDA device
+     const at::cuda::OptionalCUDAGuard guard_input1(device_of(input1));
+     CHECK_SAME_DEVICE(input1, input2);
+     CHECK_SAME_DEVICE(input1, grad_output);
+
+     return correlation_cuda_backward(input1, input2, grad_output,
+                                      kH, kW, patchH, patchW,
+                                      padH, padW,
+                                      dilationH, dilationW,
+                                      dilation_patchH, dilation_patchW,
+                                      dH, dW);
+   }else{
+     return correlation_cpp_backward(
+         input1, input2, grad_output,
+         kH, kW, patchH, patchW,
+         padH, padW,
+         dilationH, dilationW,
+         dilation_patchH, dilation_patchW,
+         dH, dW);
+   }
+ }
+
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+   m.def("forward", &correlation_sample_forward, "Spatial Correlation Sampler Forward");
+   m.def("backward", &correlation_sample_backward, "Spatial Correlation Sampler backward");
+ }
+
+ #else
+
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+   m.def("forward", &correlation_cpp_forward, "Spatial Correlation Sampler Forward");
+   m.def("backward", &correlation_cpp_backward, "Spatial Correlation Sampler backward");
+ }
+
+ #endif
Pytorch-Correlation-extension/Correlation_Module/spatial_correlation_sampler/__init__.py ADDED
@@ -0,0 +1 @@
+ from .spatial_correlation_sampler import SpatialCorrelationSampler, spatial_correlation_sample
Pytorch-Correlation-extension/Correlation_Module/spatial_correlation_sampler/spatial_correlation_sampler.py ADDED
@@ -0,0 +1,107 @@
+ from torch import nn
+ from torch.autograd import Function
+ from torch.autograd.function import once_differentiable
+ from torch.nn.modules.utils import _pair
+
+ import spatial_correlation_sampler_backend as correlation
+
+
+ def spatial_correlation_sample(input1,
+                                input2,
+                                kernel_size=1,
+                                patch_size=1,
+                                stride=1,
+                                padding=0,
+                                dilation=1,
+                                dilation_patch=1):
+     """Apply spatial correlation sampling from input1 to input2.
+
+     Every parameter except input1 and input2 can be either a single int
+     or a pair of ints. For more information about Spatial Correlation
+     Sampling, see this page:
+     https://lmb.informatik.uni-freiburg.de/Publications/2015/DFIB15/
+
+     Args:
+         input1 : The first parameter.
+         input2 : The second parameter.
+         kernel_size : total size of your correlation kernel, in pixels
+         patch_size : total size of your patch, determining how many
+             different shifts will be applied
+         stride : stride of the spatial sampler, will modify output
+             height and width
+         padding : padding applied to input1 and input2 before applying
+             the correlation sampling, will modify output height and width
+         dilation : dilation of the correlation kernel, acts like a dilated convolution
+         dilation_patch : step for every shift in patch
+
+     Returns:
+         Tensor: Result of correlation sampling
+
+     """
+     return SpatialCorrelationSamplerFunction.apply(input1, input2,
+                                                    kernel_size, patch_size,
+                                                    stride, padding, dilation, dilation_patch)
+
+
+ class SpatialCorrelationSamplerFunction(Function):
+
+     @staticmethod
+     def forward(ctx,
+                 input1,
+                 input2,
+                 kernel_size=1,
+                 patch_size=1,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 dilation_patch=1):
+
+         ctx.save_for_backward(input1, input2)
+         kH, kW = ctx.kernel_size = _pair(kernel_size)
+         patchH, patchW = ctx.patch_size = _pair(patch_size)
+         padH, padW = ctx.padding = _pair(padding)
+         dilationH, dilationW = ctx.dilation = _pair(dilation)
+         dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair(dilation_patch)
+         dH, dW = ctx.stride = _pair(stride)
+
+         output = correlation.forward(input1, input2,
+                                      kH, kW, patchH, patchW,
+                                      padH, padW, dilationH, dilationW,
+                                      dilation_patchH, dilation_patchW,
+                                      dH, dW)
+
+         return output
+
+     @staticmethod
+     @once_differentiable
+     def backward(ctx, grad_output):
+         input1, input2 = ctx.saved_tensors
+
+         kH, kW = ctx.kernel_size
+         patchH, patchW = ctx.patch_size
+         padH, padW = ctx.padding
+         dilationH, dilationW = ctx.dilation
+         dilation_patchH, dilation_patchW = ctx.dilation_patch
+         dH, dW = ctx.stride
+
+         grad_input1, grad_input2 = correlation.backward(input1, input2, grad_output,
+                                                         kH, kW, patchH, patchW,
+                                                         padH, padW, dilationH, dilationW,
+                                                         dilation_patchH, dilation_patchW,
+                                                         dH, dW)
+         return grad_input1, grad_input2, None, None, None, None, None, None
+
+
+ class SpatialCorrelationSampler(nn.Module):
+     def __init__(self, kernel_size=1, patch_size=1, stride=1, padding=0, dilation=1, dilation_patch=1):
+         super(SpatialCorrelationSampler, self).__init__()
+         self.kernel_size = kernel_size
+         self.patch_size = patch_size
+         self.stride = stride
+         self.padding = padding
+         self.dilation = dilation
+         self.dilation_patch = dilation_patch
+
+     def forward(self, input1, input2):
+         return SpatialCorrelationSamplerFunction.apply(input1, input2, self.kernel_size,
+                                                        self.patch_size, self.stride,
+                                                        self.padding, self.dilation, self.dilation_patch)
Pytorch-Correlation-extension/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) [year] [fullname]
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
Pytorch-Correlation-extension/README.md ADDED
@@ -0,0 +1,155 @@
+
+ [![PyPI](https://img.shields.io/pypi/v/spatial-correlation-sampler.svg)](https://pypi.org/project/spatial-correlation-sampler/)
+
+
+ # Pytorch Correlation module
+
+ This is a custom C++/CUDA implementation of a correlation module, used e.g. in [FlowNetC](https://arxiv.org/abs/1504.06852).
+
+ This [tutorial](http://pytorch.org/tutorials/advanced/cpp_extension.html) was used as a basis for the implementation, as well as
+ [NVIDIA's CUDA code](https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package).
+
+ - Build and install the C++ and CUDA extensions by executing `python setup.py install`,
+ - Benchmark C++ vs. CUDA by running `python benchmark.py {cpu, cuda}`,
+ - Run gradient checks on the code by running `python grad_check.py --backend {cpu, cuda}`.
+
+ # Requirements
+
+ This module is expected to compile for Pytorch `2.1.0`.
+
+ Before installation, please check the compatibility of your GPU and CUDA (_Compute Capability_) in the [nvidia docs](https://developer.nvidia.com/cuda-gpus).
+ e.g. the RTX 6000 uses CC=8.9, so we set the environment variable to
+
+ `export TORCH_CUDA_ARCH_LIST="8.9+PTX"`
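+
+ If you are not sure which value to use, you can query the compute capability of your card from PyTorch before building (a minimal sketch, assuming `torch` is already installed and a GPU is visible):
+
+ ```python
+ import torch
+
+ # Print the value to put in TORCH_CUDA_ARCH_LIST, e.g. "8.9+PTX"
+ major, minor = torch.cuda.get_device_capability(0)
+ print(f"{major}.{minor}+PTX")
+ ```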
+
+ # Installation
+
+ Be reminded that this module requires `python3-dev` to compile C++ code, e.g. on Ubuntu run:
+
+ `apt install python3-dev`
+
+ This module is available on pip:
+
+ `pip install spatial-correlation-sampler`
+
+ For a CPU-only version, you can install from source with
+
+ `python setup_cpu.py install`
+
+ # Known Problems
+
+ This module needs a compatible gcc version and CUDA to be compiled.
+ Namely, CUDA 9.1 and below will need gcc5, while CUDA 9.2 and 10.0 will need gcc7.
+ See [this issue](https://github.com/ClementPinard/Pytorch-Correlation-extension/issues/1) for more information.
+
+ # Usage
+
+ The API has a few differences with NVIDIA's module:
+ * The output is now a 5D tensor, which reflects the horizontal and vertical shifts.
+ ```
+ input (B x C x H x W) -> output (B x PatchH x PatchW x oH x oW)
+ ```
+ * Output sizes `oH` and `oW` no longer depend on the patch size, but only on kernel size and padding.
+ * Patch size `patch_size` is now the whole patch, and not only the radii.
+ * `stride1` is now `stride` and `stride2` is `dilation_patch`, which behave like dilated convolutions.
+ * The equivalent `max_displacement` is then `dilation_patch * (patch_size - 1) / 2`.
+ * `dilation` is a new parameter; it acts the same way as a dilated convolution regarding the correlation kernel.
+ * To get the right parameters for FlowNetC, you would have (a quick sanity check of this mapping follows the list):
+ ```
+ kernel_size=1,
+ patch_size=21,
+ stride=1,
+ padding=0,
+ dilation=1,
+ dilation_patch=2
+ ```
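+
+ A quick way to sanity-check these mappings is to recompute the output size and the equivalent `max_displacement` by hand; a minimal sketch using the formulas above (function and variable names are only illustrative):
+
+ ```python
+ # Output size formula used by the extension: (i + 2*pad - dilated_kernel) // stride + 1
+ def expected_output_size(H, W, kernel_size=1, stride=1, padding=0, dilation=1):
+     dilated_k = (kernel_size - 1) * dilation + 1
+     oH = (H + 2 * padding - dilated_k) // stride + 1
+     oW = (W + 2 * padding - dilated_k) // stride + 1
+     return oH, oW
+
+ # FlowNetC-style settings from the list above
+ patch_size, dilation_patch = 21, 2
+ max_displacement = dilation_patch * (patch_size - 1) // 2  # -> 20
+ print(expected_output_size(48, 64), max_displacement)      # -> (48, 64) 20
+ ```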
+
+
+ ## Example
+ ```python
+ import torch
+ from spatial_correlation_sampler import SpatialCorrelationSampler, spatial_correlation_sample
+
+ device = "cuda"
+ batch_size = 1
+ channel = 1
+ H = 10
+ W = 10
+ dtype = torch.float32
+
+ input1 = torch.randint(1, 4, (batch_size, channel, H, W), dtype=dtype, device=device, requires_grad=True)
+ input2 = torch.randint_like(input1, 1, 4).requires_grad_(True)
+
+ # You can either use the function or the module. Note that the module doesn't contain any parameter tensor.
+
+ # function
+
+ out = spatial_correlation_sample(input1,
+                                  input2,
+                                  kernel_size=3,
+                                  patch_size=1,
+                                  stride=2,
+                                  padding=0,
+                                  dilation=2,
+                                  dilation_patch=1)
+
+ # module
+
+ correlation_sampler = SpatialCorrelationSampler(
+     kernel_size=3,
+     patch_size=1,
+     stride=2,
+     padding=0,
+     dilation=2,
+     dilation_patch=1)
+ out = correlation_sampler(input1, input2)
+
+ ```
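+
+ The 5D output can then be flattened into a more conventional 4D cost volume when the rest of the network expects one; a small sketch, assuming `out` has the `(B, PatchH, PatchW, oH, oW)` shape described above:
+
+ ```python
+ # Collapse the two patch dimensions into a single "displacement" channel dimension
+ b, ph, pw, oh, ow = out.size()
+ cost_volume = out.view(b, ph * pw, oh, ow)
+ ```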
+
+ # Benchmark
+
+ * Default parameters are from `benchmark.py`; FlowNetC parameters are the same as used in `FlowNetC` with a batch size of 4, described in [this paper](https://arxiv.org/abs/1504.06852), implemented [here](https://github.com/lmb-freiburg/flownet2) and [here](https://github.com/NVIDIA/flownet2-pytorch/blob/master/networks/FlowNetC.py).
+ * Feel free to file an issue to add entries to this with your hardware!
+
+ ## CUDA Benchmark
+
+ * See [here](https://gist.github.com/ClementPinard/270e910147119831014932f67fb1b5ea) for a benchmark script working with [NVIDIA](https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package)'s code, and Pytorch.
+ * Benchmarks are launched with the environment variable `CUDA_LAUNCH_BLOCKING` set to `1`.
+ * Only `float32` is benchmarked.
+ * FlowNetC correlation parameters were launched with the following commands:
+
+ ```bash
+ CUDA_LAUNCH_BLOCKING=1 python benchmark.py --scale ms -k1 --patch 21 -s1 -p0 --patch_dilation 2 -b4 --height 48 --width 64 -c256 cuda -d float
+
+ CUDA_LAUNCH_BLOCKING=1 python NV_correlation_benchmark.py --scale ms -k1 --patch 21 -s1 -p0 --patch_dilation 2 -b4 --height 48 --width 64 -c256
+ ```
+
+ | implementation | Correlation parameters | device | pass | min time | avg time |
+ | -------------- | ---------------------- | ------- | -------- | ------------: | ------------: |
+ | ours | default | 980 GTX | forward | **5.745 ms** | **5.851 ms** |
+ | ours | default | 980 GTX | backward | 77.694 ms | 77.957 ms |
+ | NVIDIA | default | 980 GTX | forward | 13.779 ms | 13.853 ms |
+ | NVIDIA | default | 980 GTX | backward | **73.383 ms** | **73.708 ms** |
+ | | | | | | |
+ | ours | FlowNetC | 980 GTX | forward | **26.102 ms** | **26.179 ms** |
+ | ours | FlowNetC | 980 GTX | backward | **208.091 ms** | **208.510 ms** |
+ | NVIDIA | FlowNetC | 980 GTX | forward | 35.363 ms | 35.550 ms |
+ | NVIDIA | FlowNetC | 980 GTX | backward | 283.748 ms | 284.346 ms |
+
+ ### Notes
+ * The overhead of our implementation for `kernel_size` > 1 during backward needs some investigation; feel free to
+ dive into the code to improve it!
+ * The backward pass of NVIDIA is not entirely correct when stride1 > 1 and kernel_size > 1, because not everything
+ is computed; see [here](https://github.com/NVIDIA/flownet2-pytorch/blob/master/networks/correlation_package/src/correlation_cuda_kernel.cu#L120).
+
+ ## CPU Benchmark
+
+ * No other implementation is available on CPU.
+ * It is obviously not recommended to run it on CPU if you have a GPU.
+
+ | Correlation parameters | device | pass | min time | avg time |
+ | ---------------------- | -------------------- | -------- | ----------: | ----------: |
+ | default | E5-2630 v3 @ 2.40GHz | forward | 159.616 ms | 188.727 ms |
+ | default | E5-2630 v3 @ 2.40GHz | backward | 282.641 ms | 294.194 ms |
+ | FlowNetC | E5-2630 v3 @ 2.40GHz | forward | 2.138 s | 2.144 s |
+ | FlowNetC | E5-2630 v3 @ 2.40GHz | backward | 7.006 s | 7.075 s |
Pytorch-Correlation-extension/benchmark.py ADDED
@@ -0,0 +1,90 @@
+ from __future__ import division
+ from __future__ import print_function
+
+ import argparse
+ import time
+
+ import torch
+ from spatial_correlation_sampler import SpatialCorrelationSampler
+ from tqdm import trange
+
+ TIME_SCALES = {'s': 1, 'ms': 1000, 'us': 1000000}
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('backend', choices=['cpu', 'cuda'], default='cuda')
+ parser.add_argument('-b', '--batch-size', type=int, default=16)
+ parser.add_argument('-k', '--kernel-size', type=int, default=3)
+ parser.add_argument('--patch', type=int, default=3)
+ parser.add_argument('--patch_dilation', type=int, default=2)
+ parser.add_argument('-c', '--channel', type=int, default=64)
+ parser.add_argument('--height', type=int, default=100)
+ parser.add_argument('-w', '--width', type=int, default=100)
+ parser.add_argument('-s', '--stride', type=int, default=2)
+ parser.add_argument('-p', '--pad', type=int, default=1)
+ parser.add_argument('--scale', choices=['s', 'ms', 'us'], default='us')
+ parser.add_argument('-r', '--runs', type=int, default=100)
+ parser.add_argument('--dilation', type=int, default=2)
+ parser.add_argument('-d', '--dtype', choices=['half', 'float', 'double'])
+
+ args = parser.parse_args()
+
+ device = torch.device(args.backend)
+
+ if args.dtype == 'half':
+     dtype = torch.float16
+ elif args.dtype == 'float':
+     dtype = torch.float32
+ else:
+     dtype = torch.float64
+
+
+ input1 = torch.randn(args.batch_size,
+                      args.channel,
+                      args.height,
+                      args.width,
+                      dtype=dtype,
+                      device=device,
+                      requires_grad=True)
+ input2 = torch.randn_like(input1)
+
+ correlation_sampler = SpatialCorrelationSampler(
+     args.kernel_size,
+     args.patch,
+     args.stride,
+     args.pad,
+     args.dilation,
+     args.patch_dilation)
+
+ # Force CUDA initialization
+ output = correlation_sampler(input1, input2)
+ print(output.size())
+ output.mean().backward()
+ forward_min = float('inf')
+ forward_time = 0
+ backward_min = float('inf')
+ backward_time = 0
+ for _ in trange(args.runs):
+     correlation_sampler.zero_grad()
+
+     start = time.time()
+     output = correlation_sampler(input1, input2)
+     elapsed = time.time() - start
+     forward_min = min(forward_min, elapsed)
+     forward_time += elapsed
+     output = output.mean()
+
+     start = time.time()
+     (output.mean()).backward()
+     elapsed = time.time() - start
+     backward_min = min(backward_min, elapsed)
+     backward_time += elapsed
+
+ scale = TIME_SCALES[args.scale]
+ forward_min *= scale
+ backward_min *= scale
+ forward_average = forward_time / args.runs * scale
+ backward_average = backward_time / args.runs * scale
+
+ print('Forward: {0:.3f}/{1:.3f} {4} | Backward {2:.3f}/{3:.3f} {4}'.format(
+     forward_min, forward_average, backward_min, backward_average,
+     args.scale))
Pytorch-Correlation-extension/check.py ADDED
@@ -0,0 +1,119 @@
+ from __future__ import division
+ from __future__ import print_function
+
+ import argparse
+ import numpy as np
+ import torch
+
+ from spatial_correlation_sampler import SpatialCorrelationSampler
+
+
+ def check_equal(first, second, verbose):
+     if verbose:
+         print()
+     for i, (x, y) in enumerate(zip(first, second)):
+         x = x.cpu().detach().numpy()
+         y = y.cpu().detach().numpy()
+         if verbose:
+             print("x = {}".format(x.flatten()))
+             print("y = {}".format(y.flatten()))
+             print('-' * 80)
+         np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i))
+
+
+ def zero_grad(variables):
+     for variable in variables:
+         if variable.grad is not None: variable.grad.zero_()
+
+
+ def get_grads(variables):
+     return [var.grad.clone() for var in variables]
+
+
+ def check_forward(input1, input2, correlation_sampler, verbose, gpu_index=0):
+     device = torch.device(f"cuda:{gpu_index}")
+
+     cpu_values = correlation_sampler(input1, input2)
+     cuda_values = correlation_sampler(input1.to(device), input2.to(device))
+
+     print(f"Forward: CPU vs. CUDA device:{gpu_index} ... ", end='')
+     check_equal(cpu_values, cuda_values, verbose)
+     print('Ok')
+
+
+ def check_backward(input1, input2, correlation_sampler, verbose, gpu_index=0):
+     device = torch.device(f"cuda:{gpu_index}")
+
+     zero_grad([input1, input2])
+
+     cpu_values = correlation_sampler(input1, input2)
+     cpu_values.sum().backward()
+     grad_cpu = get_grads([input1, input2])
+
+     zero_grad([input1, input2])
+
+     cuda_values = correlation_sampler(input1.to(device), input2.to(device))
+     cuda_values.sum().backward()
+     grad_cuda = get_grads([input1, input2])
+
+     print(f"Backward: CPU vs. CUDA device:{gpu_index} ... ", end='')
+     check_equal(grad_cpu, grad_cuda, verbose)
+     print('Ok')
+
+
+ def check_multi_gpu_forward(correlation_sampler, verbose):
+     print("Multi-GPU forward")
+     total_gpus = torch.cuda.device_count()
+     for gpu in range(total_gpus):
+         check_forward(input1, input2, correlation_sampler, verbose, gpu_index=gpu)
+
+ def check_multi_gpu_backward(correlation_sampler, verbose):
+     print("Multi-GPU backward")
+     total_gpus = torch.cuda.device_count()
+     for gpu in range(total_gpus):
+         check_backward(input1, input2, correlation_sampler, verbose, gpu_index=gpu)
+
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('direction', choices=['forward', 'backward'], nargs='+')
+ parser.add_argument('-b', '--batch-size', type=int, default=1)
+ parser.add_argument('-k', '--kernel-size', type=int, default=3)
+ parser.add_argument('--patch', type=int, default=3)
+ parser.add_argument('--patch_dilation', type=int, default=2)
+ parser.add_argument('-c', '--channel', type=int, default=10)
+ parser.add_argument('--height', type=int, default=10)
+ parser.add_argument('-w', '--width', type=int, default=10)
+ parser.add_argument('-s', '--stride', type=int, default=2)
+ parser.add_argument('-p', '--pad', type=int, default=5)
+ parser.add_argument('-v', '--verbose', action='store_true', default=False)
+ parser.add_argument('-d', '--dilation', type=int, default=2)
+ args = parser.parse_args()
+ print(args)
+
+ assert(torch.cuda.is_available()), "no comparison to make"
+ input1 = torch.randn(args.batch_size,
+                      args.channel,
+                      args.height,
+                      args.width).double()
+ input2 = torch.randn(args.batch_size,
+                      args.channel,
+                      args.height,
+                      args.width).double()
+ input1.requires_grad = True
+ input2.requires_grad = True
+
+ correlation_sampler = SpatialCorrelationSampler(
+     args.kernel_size,
+     args.patch,
+     args.stride,
+     args.pad,
+     args.dilation,
+     args.patch_dilation)
+
+ if 'forward' in args.direction:
+     check_forward(input1, input2, correlation_sampler, args.verbose)
+     if torch.cuda.device_count() > 1: check_multi_gpu_forward(correlation_sampler, args.verbose)
+
+ if 'backward' in args.direction:
+     check_backward(input1, input2, correlation_sampler, args.verbose)
+     if torch.cuda.device_count() > 1: check_multi_gpu_backward(correlation_sampler, args.verbose)
Pytorch-Correlation-extension/grad_check.py ADDED
@@ -0,0 +1,47 @@
+ import argparse
+ import torch
+ # torch.set_printoptions(precision=1, threshold=10000)
+ from torch.autograd import gradcheck
+ from spatial_correlation_sampler import SpatialCorrelationSampler
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('backend', choices=['cpu', 'cuda'], default='cuda')
+ parser.add_argument('-b', '--batch-size', type=int, default=2)
+ parser.add_argument('-k', '--kernel-size', type=int, default=3)
+ parser.add_argument('--patch', type=int, default=3)
+ parser.add_argument('--patch_dilation', type=int, default=2)
+ parser.add_argument('-c', '--channel', type=int, default=2)
+ parser.add_argument('--height', type=int, default=10)
+ parser.add_argument('-w', '--width', type=int, default=10)
+ parser.add_argument('-s', '--stride', type=int, default=2)
+ parser.add_argument('-p', '--pad', type=int, default=1)
+ parser.add_argument('-d', '--dilation', type=int, default=2)
+
+ args = parser.parse_args()
+
+ input1 = torch.randn(args.batch_size,
+                      args.channel,
+                      args.height,
+                      args.width,
+                      dtype=torch.float64,
+                      device=torch.device(args.backend))
+ input2 = torch.randn(args.batch_size,
+                      args.channel,
+                      args.height,
+                      args.width,
+                      dtype=torch.float64,
+                      device=torch.device(args.backend))
+
+ input1.requires_grad = True
+ input2.requires_grad = True
+
+ correlation_sampler = SpatialCorrelationSampler(args.kernel_size,
+                                                 args.patch,
+                                                 args.stride,
+                                                 args.pad,
+                                                 args.dilation,
+                                                 args.patch_dilation)
+
+
+ if gradcheck(correlation_sampler, [input1, input2]):
+     print('Ok')
Pytorch-Correlation-extension/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ torch>=1.0.1
+ numpy
Pytorch-Correlation-extension/setup.py ADDED
@@ -0,0 +1,69 @@
+ import os
+ from setuptools import setup
+ from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
+ from os.path import join
+
+ CPU_ONLY = False
+ project_root = 'Correlation_Module'
+
+ source_files = ['correlation.cpp', 'correlation_sampler.cpp']
+
+ cxx_args = ['-std=c++17', '-fopenmp']
+
+ def generate_nvcc_args(gpu_archs):
+     nvcc_args = []
+     for arch in gpu_archs:
+         nvcc_args.extend(['-gencode', f'arch=compute_{arch},code=sm_{arch}'])
+     return nvcc_args
+
+ gpu_arch = os.environ.get('GPU_ARCH', '').split()
+ nvcc_args = generate_nvcc_args(gpu_arch)
+
+ with open("README.md", "r") as fh:
+     long_description = fh.read()
+
+
+ def launch_setup():
+     if CPU_ONLY:
+         Extension = CppExtension
+         macro = []
+     else:
+         Extension = CUDAExtension
+         source_files.append('correlation_cuda_kernel.cu')
+         macro = [("USE_CUDA", None)]
+
+     sources = [join(project_root, file) for file in source_files]
+
+     setup(
+         name='spatial_correlation_sampler',
+         version="0.4.0",
+         author="Clément Pinard",
+         author_email="[email protected]",
+         description="Correlation module for pytorch",
+         long_description=long_description,
+         long_description_content_type="text/markdown",
+         url="https://github.com/ClementPinard/Pytorch-Correlation-extension",
+         install_requires=['torch>=1.1', 'numpy'],
+         ext_modules=[
+             Extension('spatial_correlation_sampler_backend',
+                       sources,
+                       define_macros=macro,
+                       extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
+                       extra_link_args=['-lgomp'])
+         ],
+         package_dir={'': project_root},
+         packages=['spatial_correlation_sampler'],
+         cmdclass={
+             'build_ext': BuildExtension
+         },
+         classifiers=[
+             "Programming Language :: Python :: 3",
+             "License :: OSI Approved :: MIT License",
+             "Operating System :: POSIX :: Linux",
+             "Intended Audience :: Science/Research",
+             "Topic :: Scientific/Engineering :: Artificial Intelligence"
+         ])
+
+
+ if __name__ == '__main__':
+     launch_setup()
Pytorch-Correlation-extension/setup_cpu.py ADDED
@@ -0,0 +1,4 @@
+ import setup
+
+ setup.CPU_ONLY = True
+ setup.launch_setup()