Upload folder using huggingface_hub
- Pytorch-Correlation-extension/.gitignore +1 -0
- Pytorch-Correlation-extension/Correlation_Module/correlation.cpp +178 -0
- Pytorch-Correlation-extension/Correlation_Module/correlation_cuda_kernel.cu +327 -0
- Pytorch-Correlation-extension/Correlation_Module/correlation_sampler.cpp +138 -0
- Pytorch-Correlation-extension/Correlation_Module/spatial_correlation_sampler/__init__.py +1 -0
- Pytorch-Correlation-extension/Correlation_Module/spatial_correlation_sampler/spatial_correlation_sampler.py +107 -0
- Pytorch-Correlation-extension/LICENSE +21 -0
- Pytorch-Correlation-extension/README.md +155 -0
- Pytorch-Correlation-extension/benchmark.py +90 -0
- Pytorch-Correlation-extension/check.py +119 -0
- Pytorch-Correlation-extension/grad_check.py +47 -0
- Pytorch-Correlation-extension/requirements.txt +2 -0
- Pytorch-Correlation-extension/setup.py +69 -0
- Pytorch-Correlation-extension/setup_cpu.py +4 -0
Pytorch-Correlation-extension/.gitignore
ADDED
@@ -0,0 +1 @@
*.egg*
Pytorch-Correlation-extension/Correlation_Module/correlation.cpp
ADDED
@@ -0,0 +1,178 @@
#include <torch/extension.h>
using namespace torch;

#include <vector>

#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)

template <typename scalar_t>
static void correlate_patch(
    TensorAccessor<scalar_t,3> input1,
    TensorAccessor<scalar_t,3> input2,
    scalar_t *dst,
    int kH, int kW,
    int dilationH, int dilationW,
    int u, int v,
    int shiftU, int shiftV){
  const int C = input1.size(0);
  const int iH = input1.size(1);
  const int iW = input1.size(2);
  for (int c=0; c<C; ++c){
    for (int i=0; i<kH; ++i){
      int i1 = u + i * dilationH;
      int i2 = i1 + shiftU;
      if WITHIN_BOUNDS(i1, i2, iH, iH){
        for (int j=0; j<kW; ++j){
          int j1 = v + j * dilationW;
          int j2 = j1 + shiftV;
          if WITHIN_BOUNDS(j1, j2, iW, iW){
            scalar_t v1 = input1[c][i1][j1];
            scalar_t v2 = input2[c][i2][j2];
            *dst += v1 * v2;
          }
        }
      }
    }
  }
}

template <typename scalar_t>
static void correlate_patch_grad(
    TensorAccessor<scalar_t,3> input1,
    TensorAccessor<scalar_t,3> gradInput1,
    TensorAccessor<scalar_t,3> input2,
    TensorAccessor<scalar_t,3> gradInput2,
    scalar_t gradOutput,
    int kH, int kW,
    int dilationH, int dilationW,
    int u, int v,
    int shiftU, int shiftV){

  const int C = input1.size(0);
  const int iH = input1.size(1);
  const int iW = input1.size(2);

  for (int c=0; c<C; ++c){
    for (int i=0; i<kH; ++i){
      int i1 = u + i * dilationH;
      int i2 = i1 + shiftU;
      if WITHIN_BOUNDS(i1, i2, iH, iH){
        for (int j=0; j<kW; ++j){
          int j1 = v + j * dilationW;
          int j2 = j1 + shiftV;
          if WITHIN_BOUNDS(j1, j2, iW, iW){
            scalar_t v1 = input1[c][i1][j1];
            scalar_t v2 = input2[c][i2][j2];
            gradInput2[c][i2][j2] += gradOutput * v1;
            gradInput1[c][i1][j1] += gradOutput * v2;
          }
        }
      }
    }
  }
}

torch::Tensor correlation_cpp_forward(
    torch::Tensor input1,
    torch::Tensor input2,
    int kH, int kW,
    int patchH, int patchW,
    int padH, int padW,
    int dilationH, int dilationW,
    int dilation_patchH, int dilation_patchW,
    int dH, int dW) {

  const auto batch_size = input1.size(0);
  const auto iH = input1.size(2);
  const auto iW = input1.size(3);
  const int patchRadH = (patchH - 1) / 2;
  const int patchRadW = (patchW - 1) / 2;
  const int dilatedKH = (kH - 1) * dilationH + 1;
  const int dilatedKW = (kW - 1) * dilationW + 1;

  const auto oH = (iH + 2 * padH - dilatedKH) / dH + 1;
  const auto oW = (iW + 2 * padW - dilatedKW) / dW + 1;
  auto output = at::zeros({batch_size, patchH, patchW, oH, oW}, input1.options());

  int n, ph, pw, h, w;
  #pragma omp parallel for private(n, ph, pw, h, w) collapse(2)
  for (n = 0; n < batch_size; ++n) {
    for(ph = 0; ph < patchH; ++ph){
      for(pw = 0; pw < patchW; ++pw){
        AT_DISPATCH_FLOATING_TYPES(input1.scalar_type(), "correlation_forward_cpp", ([&] {
          auto input1_acc = input1.accessor<scalar_t, 4>();
          auto input2_acc = input2.accessor<scalar_t, 4>();
          auto output_acc = output.accessor<scalar_t, 5>();
          for (h = 0; h < oH; ++h) {
            for (w = 0; w < oW; ++w) {
              correlate_patch(input1_acc[n],
                              input2_acc[n],
                              &output_acc[n][ph][pw][h][w],
                              kH, kW,
                              dilationH, dilationW,
                              -padH + h * dH,
                              -padW + w * dW,
                              (ph - patchRadH) * dilation_patchH,
                              (pw - patchRadW) * dilation_patchW);
            }
          }
        }));
      }
    }
  }
  return output;
}

std::vector<torch::Tensor> correlation_cpp_backward(
    torch::Tensor input1,
    torch::Tensor input2,
    torch::Tensor gradOutput,
    int kH, int kW,
    int patchH, int patchW,
    int padH, int padW,
    int dilationH, int dilationW,
    int dilation_patchH, int dilation_patchW,
    int dH, int dW) {

  const int batch_size = input1.size(0);
  const int patchRadH = (patchH - 1) / 2;
  const int patchRadW = (patchW - 1) / 2;
  const int oH = gradOutput.size(3);
  const int oW = gradOutput.size(4);

  auto gradInput1 = torch::zeros_like(input1);

  auto gradInput2 = torch::zeros_like(input2);

  int n, ph, pw, h, w;
  #pragma omp parallel for private(n, ph, pw, h, w)
  for (n = 0; n < batch_size; ++n) {
    AT_DISPATCH_FLOATING_TYPES(input1.scalar_type(), "correlation_backward_cpp", ([&] {
      auto input1_acc = input1.accessor<scalar_t, 4>();
      auto gradInput1_acc = gradInput1.accessor<scalar_t, 4>();
      auto input2_acc = input2.accessor<scalar_t, 4>();
      auto gradInput2_acc = gradInput2.accessor<scalar_t, 4>();
      auto gradOutput_acc = gradOutput.accessor<scalar_t, 5>();

      for(ph = 0; ph < patchH; ++ph){
        for(pw = 0; pw < patchW; ++pw){
          for (h = 0; h < oH; ++h) {
            for (w = 0; w < oW; ++w) {
              correlate_patch_grad(input1_acc[n], gradInput1_acc[n],
                                   input2_acc[n], gradInput2_acc[n],
                                   gradOutput_acc[n][ph][pw][h][w],
                                   kH, kW,
                                   dilationH, dilationW,
                                   -padH + h * dH,
                                   -padW + w * dW,
                                   (ph - patchRadH) * dilation_patchH,
                                   (pw - patchRadW) * dilation_patchW);
            }
          }
        }
      }
    }));
  }

  return {gradInput1, gradInput2};
}
Pytorch-Correlation-extension/Correlation_Module/correlation_cuda_kernel.cu
ADDED
@@ -0,0 +1,327 @@
#include <torch/types.h>
using namespace torch;

#include <cuda.h>
#include <cuda_runtime.h>

#include <vector>
#include <iostream>

// Cuda tensor accessor definitions
// restrict pointer traits prioritize speed over memory consumption
#define TensorAcc4R PackedTensorAccessor32<scalar_t,4,RestrictPtrTraits>
#define TensorAcc5R PackedTensorAccessor32<scalar_t,5,RestrictPtrTraits>
#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)

#define THREADS_FORWARD 32
#define THREADS_BACKWARD 5


namespace corr {
template <typename scalar_t>
__global__ void correlation_cuda_forward_kernel(
    const TensorAcc4R rInput1,
    const TensorAcc4R rInput2,
    TensorAcc5R output,
    int kH, int kW,
    int patchH, int patchW,
    int padH, int padW,
    int dilationH, int dilationW,
    int dilation_patchH, int dilation_patchW,
    int dH, int dW) {

  const int iH = rInput1.size(1);
  const int iW = rInput1.size(2);
  const int C = rInput1.size(3);

  const int n = blockIdx.x;
  const int h = blockIdx.y;
  const int w = blockIdx.z;
  const int thread = threadIdx.x;

  const int start_i = -padH + h * dH;
  const int start_j = -padW + w * dW;

  const int patchRadH = dilation_patchH * (patchH - 1) / 2;
  const int patchRadW = dilation_patchW * (patchW - 1) / 2;

  __shared__ scalar_t prod_sum[THREADS_FORWARD];

  for(int ph = 0; ph < patchH; ++ph){
    int ph_dilated = ph * dilation_patchH - patchRadH;
    for(int pw = 0; pw < patchW; ++pw){
      int pw_dilated = pw * dilation_patchW - patchRadW;
      prod_sum[thread] = 0;
      for (int i=0; i<kH; ++i){
        int i1 = start_i + i * dilationH;
        int i2 = i1 + ph_dilated;
        if WITHIN_BOUNDS(i1, i2, iH, iH){
          for (int j=0; j<kW; ++j){
            int j1 = start_j + j * dilationW;
            int j2 = j1 + pw_dilated;
            if WITHIN_BOUNDS(j1, j2, iW, iW){
              for (int c=thread; c<C; c += THREADS_FORWARD){
                scalar_t v1 = rInput1[n][i1][j1][c];
                scalar_t v2 = rInput2[n][i2][j2][c];
                prod_sum[thread] += v1 * v2;
              }
            }
          }
        }
      }
      // accumulate
      __syncthreads();
      if (thread == 0) {
        scalar_t reduce_sum = 0;
        for (int index = 0; index < THREADS_FORWARD; ++index) {
          reduce_sum += prod_sum[index];
        }
        output[n][ph][pw][h][w] = reduce_sum;
      }
    }
  }
}


template <typename scalar_t>
__global__ void correlation_cuda_backward_kernel_input1(
    const TensorAcc5R gradOutput,
    const TensorAcc4R input2,
    TensorAcc4R gradInput1,
    const int kH, const int kW,
    const int patchH, const int patchW,
    const int padH, const int padW,
    const int dilationH, const int dilationW,
    const int dilation_patchH, const int dilation_patchW,
    const int dH, const int dW,
    const int batch) {
  const int iH = input2.size(2);
  const int iW = input2.size(3);

  const int H = gradOutput.size(3);
  const int W = gradOutput.size(4);

  const int patchRadH = (patchH - 1) / 2;
  const int patchRadW = (patchW - 1) / 2;

  const int n = batch;
  const int c = blockIdx.x;
  const int h = blockIdx.y;
  const int w = blockIdx.z;
  const int ph_off = threadIdx.x;
  const int pw_off = threadIdx.y;

  const int h_2 = h + padH;
  const int w_2 = w + padW;
  const int min_h = h_2 - kH * dilationH;
  const int min_w = w_2 - kW * dilationW;

  __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD];
  prod_sum[ph_off][pw_off] = 0;

  for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) {
    int i1 = h + dilation_patchH * (ph - patchRadH);
    for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) {
      int j1 = w + dilation_patchW * (pw - patchRadW);
      if (WITHIN_BOUNDS(i1, j1, iH, iW)){
        scalar_t val = input2[n][c][i1][j1];
        for(int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
          int i2 = (h_3)/dH;
          if (i2 * dH != h_3)
            continue;
          for(int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
            int j2 = (w_3) / dW;
            if(j2 * dW != w_3)
              continue;
            if WITHIN_BOUNDS(i2, j2, H, W) {
              prod_sum[ph_off][pw_off] += gradOutput[n][ph][pw][i2][j2] * val;
            }
          }
        }
      }
    }
  }

  __syncthreads();

  if (ph_off == 0 && pw_off == 0){
    scalar_t reduce_sum = 0;
    for (int ph = 0; ph < THREADS_BACKWARD; ++ph){
      for (int pw = 0; pw < THREADS_BACKWARD; ++pw){
        reduce_sum += prod_sum[ph][pw];
      }
    }
    gradInput1[n][c][h][w] = reduce_sum;
  }
}


template <typename scalar_t>
__global__ void correlation_cuda_backward_kernel_input2(
    const TensorAcc5R gradOutput,
    const TensorAcc4R input1,
    TensorAcc4R gradInput2,
    int kH, int kW,
    int patchH, int patchW,
    int padH, int padW,
    int dilationH, int dilationW,
    int dilation_patchH, int dilation_patchW,
    int dH, int dW,
    int batch) {
  const int iH = input1.size(2);
  const int iW = input1.size(3);

  const int patchRadH = (patchH - 1) / 2;
  const int patchRadW = (patchW - 1) / 2;

  const int H = gradOutput.size(3);
  const int W = gradOutput.size(4);

  const int dilatedKH = kH * dilationH;
  const int dilatedKW = kW * dilationW;

  const int n = batch;
  const int c = blockIdx.x;
  const int h = blockIdx.y;
  const int w = blockIdx.z;
  const int ph_off = threadIdx.x;
  const int pw_off = threadIdx.y;

  __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD];
  prod_sum[ph_off][pw_off] = 0;

  for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) {
    int i1 = h - dilation_patchH * (ph - patchRadH);
    for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) {
      int j1 = w - dilation_patchW * (pw - patchRadW);
      if WITHIN_BOUNDS(i1, j1, iH, iW) {
        scalar_t val = input1[n][c][i1][j1];

        const int h_2 = i1 + padH;
        const int w_2 = j1 + padW;
        const int min_h = h_2 - dilatedKH;
        const int min_w = w_2 - dilatedKW;

        for(int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
          int i2 = (h_3)/dH;
          if (i2 * dH != h_3)
            continue;
          for(int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
            int j2 = (w_3) / dW;
            if(j2 * dW != w_3)
              continue;
            if WITHIN_BOUNDS(i2, j2, H, W) {
              prod_sum[ph_off][pw_off] += gradOutput[n][ph][pw][i2][j2] * val;
            }
          }
        }
      }
    }
  }

  __syncthreads();

  if (ph_off == 0 && pw_off == 0){
    scalar_t reduce_sum = 0;
    for (int ph = 0; ph < THREADS_BACKWARD; ++ph){
      for (int pw = 0; pw < THREADS_BACKWARD; ++pw){
        reduce_sum += prod_sum[ph][pw];
      }
    }
    gradInput2[n][c][h][w] = reduce_sum;
  }
}
} // namespace corr

torch::Tensor correlation_cuda_forward(
    torch::Tensor input1,
    torch::Tensor input2,
    int kH, int kW,
    int patchH, int patchW,
    int padH, int padW,
    int dilationH, int dilationW,
    int dilation_patchH, int dilation_patchW,
    int dH, int dW) {

  const int batch_size = input1.size(0);
  const int iH = input1.size(2);
  const int iW = input1.size(3);
  const int dilatedKH = (kH - 1) * dilationH + 1;
  const int dilatedKW = (kW - 1) * dilationW + 1;

  const auto oH = (iH + 2 * padH - dilatedKH) / dH + 1;
  const auto oW = (iW + 2 * padW - dilatedKW) / dW + 1;
  auto output = torch::zeros({batch_size, patchH, patchW, oH, oW}, input1.options());

  auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();
  auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();

  const int threads = THREADS_FORWARD;
  const dim3 blocks(batch_size, oH, oW);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.scalar_type(), "correlation_forward_cuda", ([&] {
    TensorAcc4R trInput1_acc = trInput1.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
    TensorAcc4R trInput2_acc = trInput2.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
    TensorAcc5R output_acc = output.packed_accessor32<scalar_t,5,RestrictPtrTraits>();
    corr::correlation_cuda_forward_kernel<scalar_t><<<blocks, threads>>>(
        trInput1_acc, trInput2_acc, output_acc,
        kH, kW, patchH, patchW, padH, padW, dilationH, dilationW,
        dilation_patchH, dilation_patchW, dH, dW);
  }));

  return output;
}

std::vector<torch::Tensor> correlation_cuda_backward(
    torch::Tensor input1,
    torch::Tensor input2,
    torch::Tensor gradOutput,
    int kH, int kW,
    int patchH, int patchW,
    int padH, int padW,
    int dilationH, int dilationW,
    int dilation_patchH, int dilation_patchW,
    int dH, int dW) {

  auto gradInput1 = torch::zeros_like(input1);
  auto gradInput2 = torch::zeros_like(input2);

  const int batch_size = input1.size(0);
  const int iH = input1.size(2);
  const int iW = input1.size(3);
  const int C = input1.size(1);

  const dim3 blocks(C, iH, iW);
  const dim3 threads(THREADS_BACKWARD, THREADS_BACKWARD);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.scalar_type(), "correlation_backward_cuda", ([&] {
    TensorAcc4R input1_acc = input1.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
    TensorAcc4R input2_acc = input2.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
    TensorAcc4R gradInput1_acc = gradInput1.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
    TensorAcc4R gradInput2_acc = gradInput2.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
    TensorAcc5R gradOutput_acc = gradOutput.packed_accessor32<scalar_t,5,RestrictPtrTraits>();

    for (int n = 0; n < batch_size; ++n){
      corr::correlation_cuda_backward_kernel_input1<scalar_t><<<blocks, threads>>>(
          gradOutput_acc, input2_acc, gradInput1_acc,
          kH, kW, patchH, patchW, padH, padW,
          dilationH, dilationW,
          dilation_patchH, dilation_patchW,
          dH, dW,
          n);
    }

    for (int n = 0; n < batch_size; ++n){
      corr::correlation_cuda_backward_kernel_input2<scalar_t><<<blocks, threads>>>(
          gradOutput_acc, input1_acc, gradInput2_acc,
          kH, kW, patchH, patchW, padH, padW,
          dilationH, dilationW,
          dilation_patchH, dilation_patchW,
          dH, dW,
          n);
    }
  }));

  return {gradInput1, gradInput2};
}
Pytorch-Correlation-extension/Correlation_Module/correlation_sampler.cpp
ADDED
@@ -0,0 +1,138 @@
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <vector>
#include <iostream>

// declarations

torch::Tensor correlation_cpp_forward(
    torch::Tensor input1,
    torch::Tensor input2,
    int kH, int kW,
    int patchH, int patchW,
    int padH, int padW,
    int dilationH, int dilationW,
    int dilation_patchH, int dilation_patchW,
    int dH, int dW);

std::vector<torch::Tensor> correlation_cpp_backward(
    torch::Tensor grad_output,
    torch::Tensor input1,
    torch::Tensor input2,
    int kH, int kW,
    int patchH, int patchW,
    int padH, int padW,
    int dilationH, int dilationW,
    int dilation_patchH, int dilation_patchW,
    int dH, int dW);

#ifdef USE_CUDA

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define CHECK_SAME_DEVICE(x, y) TORCH_CHECK(x.device() == y.device(), #x " is not on same device as " #y)

torch::Tensor correlation_cuda_forward(
    torch::Tensor input1,
    torch::Tensor input2,
    int kH, int kW,
    int patchH, int patchW,
    int padH, int padW,
    int dilationH, int dilationW,
    int dilation_patchH, int dilation_patchW,
    int dH, int dW);

std::vector<torch::Tensor> correlation_cuda_backward(
    torch::Tensor grad_output,
    torch::Tensor input1,
    torch::Tensor input2,
    int kH, int kW,
    int patchH, int patchW,
    int padH, int padW,
    int dilationH, int dilationW,
    int dilation_patchH, int dilation_patchW,
    int dH, int dW);

// C++ interface

torch::Tensor correlation_sample_forward(
    torch::Tensor input1,
    torch::Tensor input2,
    int kH, int kW,
    int patchH, int patchW,
    int padH, int padW,
    int dilationH, int dilationW,
    int dilation_patchH, int dilation_patchW,
    int dH, int dW) {
  if (input1.device().is_cuda()){
    CHECK_INPUT(input1);
    CHECK_INPUT(input2);

    // set device of input1 as default CUDA device
    // https://pytorch.org/cppdocs/api/structc10_1_1cuda_1_1_optional_c_u_d_a_guard.html
    const at::cuda::OptionalCUDAGuard guard_input1(device_of(input1));
    CHECK_SAME_DEVICE(input1, input2);

    return correlation_cuda_forward(input1, input2, kH, kW, patchH, patchW,
                                    padH, padW, dilationH, dilationW,
                                    dilation_patchH, dilation_patchW,
                                    dH, dW);
  }else{
    return correlation_cpp_forward(input1, input2, kH, kW, patchH, patchW,
                                   padH, padW, dilationH, dilationW,
                                   dilation_patchH, dilation_patchW,
                                   dH, dW);
  }
}

std::vector<torch::Tensor> correlation_sample_backward(
    torch::Tensor input1,
    torch::Tensor input2,
    torch::Tensor grad_output,
    int kH, int kW,
    int patchH, int patchW,
    int padH, int padW,
    int dilationH, int dilationW,
    int dilation_patchH, int dilation_patchW,
    int dH, int dW) {

  if(grad_output.device().is_cuda()){
    CHECK_INPUT(input1);
    CHECK_INPUT(input2);

    // set device of input1 as default CUDA device
    const at::cuda::OptionalCUDAGuard guard_input1(device_of(input1));
    CHECK_SAME_DEVICE(input1, input2);
    CHECK_SAME_DEVICE(input1, grad_output);

    return correlation_cuda_backward(input1, input2, grad_output,
                                     kH, kW, patchH, patchW,
                                     padH, padW,
                                     dilationH, dilationW,
                                     dilation_patchH, dilation_patchW,
                                     dH, dW);
  }else{
    return correlation_cpp_backward(
        input1, input2, grad_output,
        kH, kW, patchH, patchW,
        padH, padW,
        dilationH, dilationW,
        dilation_patchH, dilation_patchW,
        dH, dW);
  }
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &correlation_sample_forward, "Spatial Correlation Sampler Forward");
  m.def("backward", &correlation_sample_backward, "Spatial Correlation Sampler backward");
}

#else

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &correlation_cpp_forward, "Spatial Correlation Sampler Forward");
  m.def("backward", &correlation_cpp_backward, "Spatial Correlation Sampler backward");
}

#endif
Pytorch-Correlation-extension/Correlation_Module/spatial_correlation_sampler/__init__.py
ADDED
@@ -0,0 +1 @@
from .spatial_correlation_sampler import SpatialCorrelationSampler, spatial_correlation_sample
Pytorch-Correlation-extension/Correlation_Module/spatial_correlation_sampler/spatial_correlation_sampler.py
ADDED
@@ -0,0 +1,107 @@
from torch import nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair

import spatial_correlation_sampler_backend as correlation


def spatial_correlation_sample(input1,
                               input2,
                               kernel_size=1,
                               patch_size=1,
                               stride=1,
                               padding=0,
                               dilation=1,
                               dilation_patch=1):
    """Apply spatial correlation sampling from input1 to input2.

    Every parameter except input1 and input2 can be either a single int
    or a pair of ints. For more information about Spatial Correlation
    Sampling, see this page:
    https://lmb.informatik.uni-freiburg.de/Publications/2015/DFIB15/

    Args:
        input1 : The first parameter.
        input2 : The second parameter.
        kernel_size : total size of your correlation kernel, in pixels
        patch_size : total size of your patch, determining how many
            different shifts will be applied
        stride : stride of the spatial sampler, will modify output
            height and width
        padding : padding applied to input1 and input2 before applying
            the correlation sampling, will modify output height and width
        dilation_patch : step for every shift in patch

    Returns:
        Tensor: Result of correlation sampling

    """
    return SpatialCorrelationSamplerFunction.apply(input1, input2,
                                                   kernel_size, patch_size,
                                                   stride, padding, dilation, dilation_patch)


class SpatialCorrelationSamplerFunction(Function):

    @staticmethod
    def forward(ctx,
                input1,
                input2,
                kernel_size=1,
                patch_size=1,
                stride=1,
                padding=0,
                dilation=1,
                dilation_patch=1):

        ctx.save_for_backward(input1, input2)
        kH, kW = ctx.kernel_size = _pair(kernel_size)
        patchH, patchW = ctx.patch_size = _pair(patch_size)
        padH, padW = ctx.padding = _pair(padding)
        dilationH, dilationW = ctx.dilation = _pair(dilation)
        dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair(dilation_patch)
        dH, dW = ctx.stride = _pair(stride)

        output = correlation.forward(input1, input2,
                                     kH, kW, patchH, patchW,
                                     padH, padW, dilationH, dilationW,
                                     dilation_patchH, dilation_patchW,
                                     dH, dW)

        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        input1, input2 = ctx.saved_variables

        kH, kW = ctx.kernel_size
        patchH, patchW = ctx.patch_size
        padH, padW = ctx.padding
        dilationH, dilationW = ctx.dilation
        dilation_patchH, dilation_patchW = ctx.dilation_patch
        dH, dW = ctx.stride

        grad_input1, grad_input2 = correlation.backward(input1, input2, grad_output,
                                                        kH, kW, patchH, patchW,
                                                        padH, padW, dilationH, dilationW,
                                                        dilation_patchH, dilation_patchW,
                                                        dH, dW)
        return grad_input1, grad_input2, None, None, None, None, None, None


class SpatialCorrelationSampler(nn.Module):
    def __init__(self, kernel_size=1, patch_size=1, stride=1, padding=0, dilation=1, dilation_patch=1):
        super(SpatialCorrelationSampler, self).__init__()
        self.kernel_size = kernel_size
        self.patch_size = patch_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.dilation_patch = dilation_patch

    def forward(self, input1, input2):
        return SpatialCorrelationSamplerFunction.apply(input1, input2, self.kernel_size,
                                                       self.patch_size, self.stride,
                                                       self.padding, self.dilation, self.dilation_patch)
Pytorch-Correlation-extension/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) [year] [fullname]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Pytorch-Correlation-extension/README.md
ADDED
@@ -0,0 +1,155 @@
[![PyPI](https://img.shields.io/pypi/v/spatial-correlation-sampler.svg)](https://pypi.org/project/spatial-correlation-sampler/)


# Pytorch Correlation module

This is a custom C++/CUDA implementation of a Correlation module, used e.g. in [FlowNetC](https://arxiv.org/abs/1504.06852).

This [tutorial](http://pytorch.org/tutorials/advanced/cpp_extension.html) was used as a basis for the implementation, as well as
[NVIDIA's CUDA code](https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package).

- Build and install the C++ and CUDA extensions by executing `python setup.py install`,
- Benchmark C++ vs. CUDA by running `python benchmark.py {cpu, cuda}`,
- Run gradient checks on the code by running `python grad_check.py --backend {cpu, cuda}`.

# Requirements

This module is expected to compile for Pytorch `2.1.0`.

Before installation, please check the compatibility of your GPU and CUDA (_Compute Capability_) in the [nvidia docs](https://developer.nvidia.com/cuda-gpus).
E.g. an RTX 6000 uses CC 8.9, so we set the environment variable to

`export TORCH_CUDA_ARCH_LIST="8.9+PTX"`

# Installation

Be reminded that this module requires `python3-dev` to compile C++ code, e.g. on Ubuntu run:

`apt install python3-dev`

This module is available on pip:

`pip install spatial-correlation-sampler`

For a CPU-only version, you can install from source with

`python setup_cpu.py install`

# Known Problems

This module needs compatible gcc and CUDA versions to be compiled.
Namely, CUDA 9.1 and below need gcc5, while CUDA 9.2 and 10.0 need gcc7.
See [this issue](https://github.com/ClementPinard/Pytorch-Correlation-extension/issues/1) for more information.

# Usage

The API has a few differences from NVIDIA's module:
* The output is now a 5D tensor, which reflects the horizontal and vertical shifts.
```
input (B x C x H x W) -> output (B x PatchH x PatchW x oH x oW)
```
* Output sizes `oH` and `oW` no longer depend on the patch size, but only on the kernel size and padding (see the sketch after this list).
* `patch_size` is now the whole patch, and not only the radius.
* `stride1` is now `stride` and `stride2` is `dilation_patch`, which behaves like a dilated convolution.
* The equivalent `max_displacement` is then `dilation_patch * (patch_size - 1) / 2`.
* `dilation` is a new parameter; it acts the same way as in a dilated convolution with regard to the correlation kernel.
* To get the right parameters for FlowNetC, you would use
```
kernel_size=1,
patch_size=21,
stride=1,
padding=0,
dilation=1,
dilation_patch=2
```
66 |
+
|
67 |
+
|
68 |
+
## Example
|
69 |
+
```python
|
70 |
+
import torch
|
71 |
+
from spatial_correlation_sampler import SpatialCorrelationSampler,
|
72 |
+
|
73 |
+
device = "cuda"
|
74 |
+
batch_size = 1
|
75 |
+
channel = 1
|
76 |
+
H = 10
|
77 |
+
W = 10
|
78 |
+
dtype = torch.float32
|
79 |
+
|
80 |
+
input1 = torch.randint(1, 4, (batch_size, channel, H, W), dtype=dtype, device=device, requires_grad=True)
|
81 |
+
input2 = torch.randint_like(input1, 1, 4).requires_grad_(True)
|
82 |
+
|
83 |
+
#You can either use the function or the module. Note that the module doesn't contain any parameter tensor.
|
84 |
+
|
85 |
+
#function
|
86 |
+
|
87 |
+
out = spatial_correlation_sample(input1,
|
88 |
+
input2,
|
89 |
+
kernel_size=3,
|
90 |
+
patch_size=1,
|
91 |
+
stride=2,
|
92 |
+
padding=0,
|
93 |
+
dilation=2,
|
94 |
+
dilation_patch=1)
|
95 |
+
|
96 |
+
#module
|
97 |
+
|
98 |
+
correlation_sampler = SpatialCorrelationSampler(
|
99 |
+
kernel_size=3,
|
100 |
+
patch_size=1,
|
101 |
+
stride=2,
|
102 |
+
padding=0,
|
103 |
+
dilation=2,
|
104 |
+
dilation_patch=1)
|
105 |
+
out = correlation_sampler(input1, input2)
|
106 |
+
|
107 |
+
```
|
108 |
+
|
109 |
+
# Benchmark
|
110 |
+
|
111 |
+
* default parameters are from `benchmark.py`, FlowNetC parameters are same as use in `FlowNetC` with a batch size of 4, described in [this paper](https://arxiv.org/abs/1504.06852), implemented [here](https://github.com/lmb-freiburg/flownet2) and [here](https://github.com/NVIDIA/flownet2-pytorch/blob/master/networks/FlowNetC.py).
|
112 |
+
* Feel free to file an issue to add entries to this with your hardware !
|
113 |
+
|
114 |
+
## CUDA Benchmark
|
115 |
+
|
116 |
+
* See [here](https://gist.github.com/ClementPinard/270e910147119831014932f67fb1b5ea) for a benchmark script working with [NVIDIA](https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package)'s code, and Pytorch.
|
117 |
+
* Benchmark are launched with environment variable `CUDA_LAUNCH_BLOCKING` set to `1`.
|
118 |
+
* Only `float32` is benchmarked.
|
119 |
+
* FlowNetC correlation parameters where launched with the following command:
|
120 |
+
|
121 |
+
```bash
|
122 |
+
CUDA_LAUNCH_BLOCKING=1 python benchmark.py --scale ms -k1 --patch 21 -s1 -p0 --patch_dilation 2 -b4 --height 48 --width 64 -c256 cuda -d float
|
123 |
+
|
124 |
+
CUDA_LAUNCH_BLOCKING=1 python NV_correlation_benchmark.py --scale ms -k1 --patch 21 -s1 -p0 --patch_dilation 2 -b4 --height 48 --width 64 -c256
|
125 |
+
```
|
126 |
+
|
127 |
+
| implementation | Correlation parameters | device | pass | min time | avg time |
|
128 |
+
| -------------- | ---------------------- | ------- | -------- | ------------: | ------------: |
|
129 |
+
| ours | default | 980 GTX | forward | **5.745 ms** | **5.851 ms** |
|
130 |
+
| ours | default | 980 GTX | backward | 77.694 ms | 77.957 ms |
|
131 |
+
| NVIDIA | default | 980 GTX | forward | 13.779 ms | 13.853 ms |
|
132 |
+
| NVIDIA | default | 980 GTX | backward | **73.383 ms** | **73.708 ms** |
|
133 |
+
| | | | | | |
|
134 |
+
| ours | FlowNetC | 980 GTX | forward | **26.102 ms** | **26.179 ms** |
|
135 |
+
| ours | FlowNetC | 980 GTX | backward | **208.091 ms** | **208.510 ms** |
|
136 |
+
| NVIDIA | FlowNetC | 980 GTX | forward | 35.363 ms | 35.550 ms |
|
137 |
+
| NVIDIA | FlowNetC | 980 GTX | backward | 283.748 ms | 284.346 ms |
|
138 |
+
|
139 |
+
### Notes
|
140 |
+
* The overhead of our implementation regarding `kernel_size` > 1 during backward needs some investigation, feel free to
|
141 |
+
dive in the code to improve it !
|
142 |
+
* The backward pass of NVIDIA is not entirely correct when stride1 > 1 and kernel_size > 1, because not everything
|
143 |
+
is computed, see [here](https://github.com/NVIDIA/flownet2-pytorch/blob/master/networks/correlation_package/src/correlation_cuda_kernel.cu#L120).
|
144 |
+
|
145 |
+
## CPU Benchmark
|
146 |
+
|
147 |
+
* No other implementation is avalaible on CPU.
|
148 |
+
* It is obviously not recommended to run it on CPU if you have a GPU.
|
149 |
+
|
150 |
+
| Correlation parameters | device | pass | min time | avg time |
|
151 |
+
| ---------------------- | -------------------- | -------- | ----------: | ----------: |
|
152 |
+
| default | E5-2630 v3 @ 2.40GHz | forward | 159.616 ms | 188.727 ms |
|
153 |
+
| default | E5-2630 v3 @ 2.40GHz | backward | 282.641 ms | 294.194 ms |
|
154 |
+
| FlowNetC | E5-2630 v3 @ 2.40GHz | forward | 2.138 s | 2.144 s |
|
155 |
+
| FlowNetC | E5-2630 v3 @ 2.40GHz | backward | 7.006 s | 7.075 s |
|
Pytorch-Correlation-extension/benchmark.py
ADDED
@@ -0,0 +1,90 @@
from __future__ import division
from __future__ import print_function

import argparse
import time

import torch
from spatial_correlation_sampler import SpatialCorrelationSampler
from tqdm import trange

TIME_SCALES = {'s': 1, 'ms': 1000, 'us': 1000000}

parser = argparse.ArgumentParser()
parser.add_argument('backend', choices=['cpu', 'cuda'], default='cuda')
parser.add_argument('-b', '--batch-size', type=int, default=16)
parser.add_argument('-k', '--kernel-size', type=int, default=3)
parser.add_argument('--patch', type=int, default=3)
parser.add_argument('--patch_dilation', type=int, default=2)
parser.add_argument('-c', '--channel', type=int, default=64)
parser.add_argument('--height', type=int, default=100)
parser.add_argument('-w', '--width', type=int, default=100)
parser.add_argument('-s', '--stride', type=int, default=2)
parser.add_argument('-p', '--pad', type=int, default=1)
parser.add_argument('--scale', choices=['s', 'ms', 'us'], default='us')
parser.add_argument('-r', '--runs', type=int, default=100)
parser.add_argument('--dilation', type=int, default=2)
parser.add_argument('-d', '--dtype', choices=['half', 'float', 'double'])

args = parser.parse_args()

device = torch.device(args.backend)

if args.dtype == 'half':
    dtype = torch.float16
elif args.dtype == 'float':
    dtype = torch.float32
else:
    dtype = torch.float64


input1 = torch.randn(args.batch_size,
                     args.channel,
                     args.height,
                     args.width,
                     dtype=dtype,
                     device=device,
                     requires_grad=True)
input2 = torch.randn_like(input1)

correlation_sampler = SpatialCorrelationSampler(
    args.kernel_size,
    args.patch,
    args.stride,
    args.pad,
    args.dilation,
    args.patch_dilation)

# Force CUDA initialization
output = correlation_sampler(input1, input2)
print(output.size())
output.mean().backward()
forward_min = float('inf')
forward_time = 0
backward_min = float('inf')
backward_time = 0
for _ in trange(args.runs):
    correlation_sampler.zero_grad()

    start = time.time()
    output = correlation_sampler(input1, input2)
    elapsed = time.time() - start
    forward_min = min(forward_min, elapsed)
    forward_time += elapsed
    output = output.mean()

    start = time.time()
    (output.mean()).backward()
    elapsed = time.time() - start
    backward_min = min(backward_min, elapsed)
    backward_time += elapsed

scale = TIME_SCALES[args.scale]
forward_min *= scale
backward_min *= scale
forward_average = forward_time / args.runs * scale
backward_average = backward_time / args.runs * scale

print('Forward: {0:.3f}/{1:.3f} {4} | Backward {2:.3f}/{3:.3f} {4}'.format(
    forward_min, forward_average, backward_min, backward_average,
    args.scale))
Pytorch-Correlation-extension/check.py
ADDED
@@ -0,0 +1,119 @@
from __future__ import division
from __future__ import print_function

import argparse
import numpy as np
import torch

from spatial_correlation_sampler import SpatialCorrelationSampler


def check_equal(first, second, verbose):
    if verbose:
        print()
    for i, (x, y) in enumerate(zip(first, second)):
        x = x.cpu().detach().numpy()
        y = y.cpu().detach().numpy()
        if verbose:
            print("x = {}".format(x.flatten()))
            print("y = {}".format(y.flatten()))
            print('-' * 80)
        np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i))


def zero_grad(variables):
    for variable in variables:
        if variable.grad is not None: variable.grad.zero_()


def get_grads(variables):
    return [var.grad.clone() for var in variables]


def check_forward(input1, input2, correlation_sampler, verbose, gpu_index=0):
    device = torch.device(f"cuda:{gpu_index}")

    cpu_values = correlation_sampler(input1, input2)
    cuda_values = correlation_sampler(input1.to(device), input2.to(device))

    print(f"Forward: CPU vs. CUDA device:{gpu_index} ... ", end='')
    check_equal(cpu_values, cuda_values, verbose)
    print('Ok')


def check_backward(input1, input2, correlation_sampler, verbose, gpu_index=0):
    device = torch.device(f"cuda:{gpu_index}")

    zero_grad([input1, input2])

    cpu_values = correlation_sampler(input1, input2)
    cpu_values.sum().backward()
    grad_cpu = get_grads([input1, input2])

    zero_grad([input1, input2])

    cuda_values = correlation_sampler(input1.to(device), input2.to(device))
    cuda_values.sum().backward()
    grad_cuda = get_grads([input1, input2])

    print(f"Backward: CPU vs. CUDA device:{gpu_index} ... ", end='')
    check_equal(grad_cpu, grad_cuda, verbose)
    print('Ok')


def check_multi_gpu_forward(correlation_sampler, verbose):
    print("Multi-GPU forward")
    total_gpus = torch.cuda.device_count()
    for gpu in range(total_gpus):
        check_forward(input1, input2, correlation_sampler, verbose, gpu_index=gpu)

def check_multi_gpu_backward(correlation_sampler, verbose):
    print("Multi-GPU backward")
    total_gpus = torch.cuda.device_count()
    for gpu in range(total_gpus):
        check_backward(input1, input2, correlation_sampler, verbose, gpu_index=gpu)


parser = argparse.ArgumentParser()
parser.add_argument('direction', choices=['forward', 'backward'], nargs='+')
parser.add_argument('-b', '--batch-size', type=int, default=1)
parser.add_argument('-k', '--kernel-size', type=int, default=3)
parser.add_argument('--patch', type=int, default=3)
parser.add_argument('--patch_dilation', type=int, default=2)
parser.add_argument('-c', '--channel', type=int, default=10)
parser.add_argument('--height', type=int, default=10)
parser.add_argument('-w', '--width', type=int, default=10)
parser.add_argument('-s', '--stride', type=int, default=2)
parser.add_argument('-p', '--pad', type=int, default=5)
parser.add_argument('-v', '--verbose', action='store_true', default=False)
parser.add_argument('-d', '--dilation', type=int, default=2)
args = parser.parse_args()
print(args)

assert(torch.cuda.is_available()), "no comparison to make"
input1 = torch.randn(args.batch_size,
                     args.channel,
                     args.height,
                     args.width).double()
input2 = torch.randn(args.batch_size,
                     args.channel,
                     args.height,
                     args.width).double()
input1.requires_grad = True
input2.requires_grad = True

correlation_sampler = SpatialCorrelationSampler(
    args.kernel_size,
    args.patch,
    args.stride,
    args.pad,
    args.dilation,
    args.patch_dilation)

if 'forward' in args.direction:
    check_forward(input1, input2, correlation_sampler, args.verbose)
    if torch.cuda.device_count() > 1: check_multi_gpu_forward(correlation_sampler, args.verbose)

if 'backward' in args.direction:
    check_backward(input1, input2, correlation_sampler, args.verbose)
    if torch.cuda.device_count() > 1: check_multi_gpu_backward(correlation_sampler, args.verbose)
Pytorch-Correlation-extension/grad_check.py
ADDED
@@ -0,0 +1,47 @@
import argparse
import torch
# torch.set_printoptions(precision=1, threshold=10000)
from torch.autograd import gradcheck
from spatial_correlation_sampler import SpatialCorrelationSampler

parser = argparse.ArgumentParser()
parser.add_argument('backend', choices=['cpu', 'cuda'], default='cuda')
parser.add_argument('-b', '--batch-size', type=int, default=2)
parser.add_argument('-k', '--kernel-size', type=int, default=3)
parser.add_argument('--patch', type=int, default=3)
parser.add_argument('--patch_dilation', type=int, default=2)
parser.add_argument('-c', '--channel', type=int, default=2)
parser.add_argument('--height', type=int, default=10)
parser.add_argument('-w', '--width', type=int, default=10)
parser.add_argument('-s', '--stride', type=int, default=2)
parser.add_argument('-p', '--pad', type=int, default=1)
parser.add_argument('-d', '--dilation', type=int, default=2)

args = parser.parse_args()

input1 = torch.randn(args.batch_size,
                     args.channel,
                     args.height,
                     args.width,
                     dtype=torch.float64,
                     device=torch.device(args.backend))
input2 = torch.randn(args.batch_size,
                     args.channel,
                     args.height,
                     args.width,
                     dtype=torch.float64,
                     device=torch.device(args.backend))

input1.requires_grad = True
input2.requires_grad = True

correlation_sampler = SpatialCorrelationSampler(args.kernel_size,
                                                args.patch,
                                                args.stride,
                                                args.pad,
                                                args.dilation,
                                                args.patch_dilation)


if gradcheck(correlation_sampler, [input1, input2]):
    print('Ok')
Pytorch-Correlation-extension/requirements.txt
ADDED
@@ -0,0 +1,2 @@
torch>=1.0.1
numpy
Pytorch-Correlation-extension/setup.py
ADDED
@@ -0,0 +1,69 @@
import os
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
from os.path import join

CPU_ONLY = False
project_root = 'Correlation_Module'

source_files = ['correlation.cpp', 'correlation_sampler.cpp']

cxx_args = ['-std=c++17', '-fopenmp']

def generate_nvcc_args(gpu_archs):
    nvcc_args = []
    for arch in gpu_archs:
        nvcc_args.extend(['-gencode', f'arch=compute_{arch},code=sm_{arch}'])
    return nvcc_args

gpu_arch = os.environ.get('GPU_ARCH', '').split()
nvcc_args = generate_nvcc_args(gpu_arch)

with open("README.md", "r") as fh:
    long_description = fh.read()


def launch_setup():
    if CPU_ONLY:
        Extension = CppExtension
        macro = []
    else:
        Extension = CUDAExtension
        source_files.append('correlation_cuda_kernel.cu')
        macro = [("USE_CUDA", None)]

    sources = [join(project_root, file) for file in source_files]

    setup(
        name='spatial_correlation_sampler',
        version="0.4.0",
        author="Clément Pinard",
        author_email="[email protected]",
        description="Correlation module for pytorch",
        long_description=long_description,
        long_description_content_type="text/markdown",
        url="https://github.com/ClementPinard/Pytorch-Correlation-extension",
        install_requires=['torch>=1.1', 'numpy'],
        ext_modules=[
            Extension('spatial_correlation_sampler_backend',
                      sources,
                      define_macros=macro,
                      extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
                      extra_link_args=['-lgomp'])
        ],
        package_dir={'': project_root},
        packages=['spatial_correlation_sampler'],
        cmdclass={
            'build_ext': BuildExtension
        },
        classifiers=[
            "Programming Language :: Python :: 3",
            "License :: OSI Approved :: MIT License",
            "Operating System :: POSIX :: Linux",
            "Intended Audience :: Science/Research",
            "Topic :: Scientific/Engineering :: Artificial Intelligence"
        ])


if __name__ == '__main__':
    launch_setup()
Pytorch-Correlation-extension/setup_cpu.py
ADDED
@@ -0,0 +1,4 @@
import setup

setup.CPU_ONLY = True
setup.launch_setup()