aikenml committed on
Commit
e5f748f
·
1 Parent(s): 90b65ad

Upload folder using huggingface_hub

Pytorch-Correlation-extension/.gitignore ADDED
@@ -0,0 +1 @@
+ *.egg*
Pytorch-Correlation-extension/Correlation_Module/correlation.cpp ADDED
@@ -0,0 +1,178 @@
+ #include <torch/extension.h>
+ using namespace torch;
+
+ #include <vector>
+
+ #define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)
+
+ template <typename scalar_t>
+ static void correlate_patch(
+     TensorAccessor<scalar_t,3> input1,
+     TensorAccessor<scalar_t,3> input2,
+     scalar_t *dst,
+     int kH, int kW,
+     int dilationH, int dilationW,
+     int u, int v,
+     int shiftU, int shiftV){
+   const int C = input1.size(0);
+   const int iH = input1.size(1);
+   const int iW = input1.size(2);
+   for (int c=0; c<C; ++c){
+     for (int i=0; i<kH; ++i){
+       int i1 = u + i * dilationH;
+       int i2 = i1 + shiftU;
+       if WITHIN_BOUNDS(i1, i2, iH, iH){
+         for (int j=0; j<kW; ++j){
+           int j1 = v + j * dilationW;
+           int j2 = j1 + shiftV;
+           if WITHIN_BOUNDS(j1, j2, iW, iW){
+             scalar_t v1 = input1[c][i1][j1];
+             scalar_t v2 = input2[c][i2][j2];
+             *dst += v1 * v2;
+           }
+         }
+       }
+     }
+   }
+ }
+
+ template <typename scalar_t>
+ static void correlate_patch_grad(
+     TensorAccessor<scalar_t,3> input1,
+     TensorAccessor<scalar_t,3> gradInput1,
+     TensorAccessor<scalar_t,3> input2,
+     TensorAccessor<scalar_t,3> gradInput2,
+     scalar_t gradOutput,
+     int kH, int kW,
+     int dilationH, int dilationW,
+     int u, int v,
+     int shiftU, int shiftV){
+
+   const int C = input1.size(0);
+   const int iH = input1.size(1);
+   const int iW = input1.size(2);
+
+   for (int c=0; c<C; ++c){
+     for (int i=0; i<kH; ++i){
+       int i1 = u + i * dilationH;
+       int i2 = i1 + shiftU;
+       if WITHIN_BOUNDS(i1, i2, iH, iH){
+         for (int j=0; j<kW; ++j){
+           int j1 = v + j * dilationW;
+           int j2 = j1 + shiftV;
+           if WITHIN_BOUNDS(j1, j2, iW, iW){
+             scalar_t v1 = input1[c][i1][j1];
+             scalar_t v2 = input2[c][i2][j2];
+             gradInput2[c][i2][j2] += gradOutput * v1;
+             gradInput1[c][i1][j1] += gradOutput * v2;
+           }
+         }
+       }
+     }
+   }
+ }
+
+ torch::Tensor correlation_cpp_forward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW) {
+
+   const auto batch_size = input1.size(0);
+   const auto iH = input1.size(2);
+   const auto iW = input1.size(3);
+   const int patchRadH = (patchH - 1) / 2;
+   const int patchRadW = (patchW - 1) / 2;
+   const int dilatedKH = (kH - 1) * dilationH + 1;
+   const int dilatedKW = (kW - 1) * dilationW + 1;
+
+   const auto oH = (iH + 2 * padH - dilatedKH) / dH + 1;
+   const auto oW = (iW + 2 * padW - dilatedKW) / dW + 1;
+   auto output = at::zeros({batch_size, patchH, patchW, oH, oW}, input1.options());
+
+   int n, ph, pw, h, w;
+   #pragma omp parallel for private(n, ph, pw, h, w) collapse(2)
+   for (n = 0; n < batch_size; ++n) {
+     for(ph = 0; ph < patchH; ++ph){
+       for(pw = 0; pw < patchW; ++pw){
+         AT_DISPATCH_FLOATING_TYPES(input1.scalar_type(), "correlation_forward_cpp", ([&] {
+           auto input1_acc = input1.accessor<scalar_t, 4>();
+           auto input2_acc = input2.accessor<scalar_t, 4>();
+           auto output_acc = output.accessor<scalar_t, 5>();
+           for (h = 0; h < oH; ++h) {
+             for (w = 0; w < oW; ++w) {
+               correlate_patch(input1_acc[n],
+                               input2_acc[n],
+                               &output_acc[n][ph][pw][h][w],
+                               kH, kW,
+                               dilationH, dilationW,
+                               -padH + h * dH,
+                               -padW + w * dW,
+                               (ph - patchRadH) * dilation_patchH,
+                               (pw - patchRadW) * dilation_patchW);
+             }
+           }
+         }));
+       }
+     }
+   }
+   return output;
+ }
+
+ std::vector<torch::Tensor> correlation_cpp_backward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     torch::Tensor gradOutput,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW) {
+
+   const int batch_size = input1.size(0);
+   const int patchRadH = (patchH - 1) / 2;
+   const int patchRadW = (patchW - 1) / 2;
+   const int oH = gradOutput.size(3);
+   const int oW = gradOutput.size(4);
+
+   auto gradInput1 = torch::zeros_like(input1);
+
+   auto gradInput2 = torch::zeros_like(input2);
+
+   int n, ph, pw, h, w;
+   #pragma omp parallel for private(n, ph, pw, h, w)
+   for (n = 0; n < batch_size; ++n) {
+     AT_DISPATCH_FLOATING_TYPES(input1.scalar_type(), "correlation_backward_cpp", ([&] {
+       auto input1_acc = input1.accessor<scalar_t, 4>();
+       auto gradInput1_acc = gradInput1.accessor<scalar_t, 4>();
+       auto input2_acc = input2.accessor<scalar_t, 4>();
+       auto gradInput2_acc = gradInput2.accessor<scalar_t, 4>();
+       auto gradOutput_acc = gradOutput.accessor<scalar_t, 5>();
+
+       for(ph = 0; ph < patchH; ++ph){
+         for(pw = 0; pw < patchW; ++pw){
+           for (h = 0; h < oH; ++h) {
+             for (w = 0; w < oW; ++w) {
+               correlate_patch_grad(input1_acc[n], gradInput1_acc[n],
+                                    input2_acc[n], gradInput2_acc[n],
+                                    gradOutput_acc[n][ph][pw][h][w],
+                                    kH, kW,
+                                    dilationH, dilationW,
+                                    -padH + h * dH,
+                                    -padW + w * dW,
+                                    (ph - patchRadH) * dilation_patchH,
+                                    (pw - patchRadW) * dilation_patchW);
+             }
+           }
+         }
+       }
+     }));
+   }
+
+   return {gradInput1, gradInput2};
+ }
Pytorch-Correlation-extension/Correlation_Module/correlation_cuda_kernel.cu ADDED
@@ -0,0 +1,327 @@
+ #include <torch/types.h>
+ using namespace torch;
+
+ #include <cuda.h>
+ #include <cuda_runtime.h>
+
+ #include <vector>
+ #include <iostream>
+
+ // Cuda tensor accessor definitions
+ // restrict pointer traits prioritize speed over memory consumption
+ #define TensorAcc4R PackedTensorAccessor32<scalar_t,4,RestrictPtrTraits>
+ #define TensorAcc5R PackedTensorAccessor32<scalar_t,5,RestrictPtrTraits>
+ #define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)
+
+ #define THREADS_FORWARD 32
+ #define THREADS_BACKWARD 5
+
+
+ namespace corr {
+ template <typename scalar_t>
+ __global__ void correlation_cuda_forward_kernel(
+     const TensorAcc4R rInput1,
+     const TensorAcc4R rInput2,
+     TensorAcc5R output,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW) {
+
+   const int iH = rInput1.size(1);
+   const int iW = rInput1.size(2);
+   const int C = rInput1.size(3);
+
+   const int n = blockIdx.x;
+   const int h = blockIdx.y;
+   const int w = blockIdx.z;
+   const int thread = threadIdx.x;
+
+   const int start_i = -padH + h * dH;
+   const int start_j = -padW + w * dW;
+
+   const int patchRadH = dilation_patchH * (patchH - 1) / 2;
+   const int patchRadW = dilation_patchW * (patchW - 1) / 2;
+
+   __shared__ scalar_t prod_sum[THREADS_FORWARD];
+
+   for(int ph = 0; ph < patchH; ++ph){
+     int ph_dilated = ph * dilation_patchH - patchRadH;
+     for(int pw = 0; pw < patchW; ++pw){
+       int pw_dilated = pw * dilation_patchW - patchRadW;
+       prod_sum[thread] = 0;
+       for (int i=0; i<kH; ++i){
+         int i1 = start_i + i * dilationH;
+         int i2 = i1 + ph_dilated;
+         if WITHIN_BOUNDS(i1, i2, iH, iH){
+           for (int j=0; j<kW; ++j){
+             int j1 = start_j + j * dilationW;
+             int j2 = j1 + pw_dilated;
+             if WITHIN_BOUNDS(j1, j2, iW, iW){
+               for (int c=thread; c<C; c += THREADS_FORWARD){
+                 scalar_t v1 = rInput1[n][i1][j1][c];
+                 scalar_t v2 = rInput2[n][i2][j2][c];
+                 prod_sum[thread] += v1 * v2;
+               }
+             }
+           }
+         }
+       }
+       // accumulate
+       __syncthreads();
+       if (thread == 0) {
+         scalar_t reduce_sum = 0;
+         for (int index = 0; index < THREADS_FORWARD; ++index) {
+           reduce_sum += prod_sum[index];
+         }
+         output[n][ph][pw][h][w] = reduce_sum;
+       }
+     }
+   }
+ }
+
+
+ template <typename scalar_t>
+ __global__ void correlation_cuda_backward_kernel_input1(
+     const TensorAcc5R gradOutput,
+     const TensorAcc4R input2,
+     TensorAcc4R gradInput1,
+     const int kH, const int kW,
+     const int patchH, const int patchW,
+     const int padH, const int padW,
+     const int dilationH, const int dilationW,
+     const int dilation_patchH, const int dilation_patchW,
+     const int dH, const int dW,
+     const int batch) {
+   const int iH = input2.size(2);
+   const int iW = input2.size(3);
+
+   const int H = gradOutput.size(3);
+   const int W = gradOutput.size(4);
+
+   const int patchRadH = (patchH - 1) / 2;
+   const int patchRadW = (patchW - 1) / 2;
+
+   const int n = batch;
+   const int c = blockIdx.x;
+   const int h = blockIdx.y;
+   const int w = blockIdx.z;
+   const int ph_off = threadIdx.x;
+   const int pw_off = threadIdx.y;
+
+   const int h_2 = h + padH;
+   const int w_2 = w + padW;
+   const int min_h = h_2 - kH * dilationH;
+   const int min_w = w_2 - kW * dilationW;
+
+   __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD];
+   prod_sum[ph_off][pw_off] = 0;
+
+   for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) {
+     int i1 = h + dilation_patchH * (ph - patchRadH);
+     for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) {
+       int j1 = w + dilation_patchW * (pw - patchRadW);
+       if (WITHIN_BOUNDS(i1, j1, iH, iW)){
+         scalar_t val = input2[n][c][i1][j1];
+         for(int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+           int i2 = (h_3)/dH;
+           if (i2 * dH != h_3)
+             continue;
+           for(int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+             int j2 = (w_3) / dW;
+             if(j2 * dW != w_3)
+               continue;
+             if WITHIN_BOUNDS(i2, j2, H, W) {
+               prod_sum[ph_off][pw_off] += gradOutput[n][ph][pw][i2][j2] * val;
+             }
+           }
+         }
+       }
+     }
+   }
+
+   __syncthreads();
+
+   if (ph_off == 0 && pw_off == 0){
+     scalar_t reduce_sum = 0;
+     for (int ph = 0; ph < THREADS_BACKWARD; ++ph){
+       for (int pw = 0; pw < THREADS_BACKWARD; ++pw){
+         reduce_sum += prod_sum[ph][pw];
+       }
+     }
+     gradInput1[n][c][h][w] = reduce_sum;
+   }
+ }
+
+
+ template <typename scalar_t>
+ __global__ void correlation_cuda_backward_kernel_input2(
+     const TensorAcc5R gradOutput,
+     const TensorAcc4R input1,
+     TensorAcc4R gradInput2,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW,
+     int batch) {
+   const int iH = input1.size(2);
+   const int iW = input1.size(3);
+
+   const int patchRadH = (patchH - 1) / 2;
+   const int patchRadW = (patchW - 1) / 2;
+
+   const int H = gradOutput.size(3);
+   const int W = gradOutput.size(4);
+
+   const int dilatedKH = kH * dilationH;
+   const int dilatedKW = kW * dilationW;
+
+   const int n = batch;
+   const int c = blockIdx.x;
+   const int h = blockIdx.y;
+   const int w = blockIdx.z;
+   const int ph_off = threadIdx.x;
+   const int pw_off = threadIdx.y;
+
+   __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD];
+   prod_sum[ph_off][pw_off] = 0;
+
+   for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) {
+     int i1 = h - dilation_patchH * (ph - patchRadH);
+     for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) {
+       int j1 = w - dilation_patchW * (pw - patchRadW);
+       if WITHIN_BOUNDS(i1, j1, iH, iW) {
+         scalar_t val = input1[n][c][i1][j1];
+
+         const int h_2 = i1 + padH;
+         const int w_2 = j1 + padW;
+         const int min_h = h_2 - dilatedKH;
+         const int min_w = w_2 - dilatedKW;
+
+         for(int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+           int i2 = (h_3)/dH;
+           if (i2 * dH != h_3)
+             continue;
+           for(int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+             int j2 = (w_3) / dW;
+             if(j2 * dW != w_3)
+               continue;
+             if WITHIN_BOUNDS(i2, j2, H, W) {
+               prod_sum[ph_off][pw_off] += gradOutput[n][ph][pw][i2][j2] * val;
+             }
+           }
+         }
+       }
+     }
+   }
+
+   __syncthreads();
+
+   if (ph_off == 0 && pw_off == 0){
+     scalar_t reduce_sum = 0;
+     for (int ph = 0; ph < THREADS_BACKWARD; ++ph){
+       for (int pw = 0; pw < THREADS_BACKWARD; ++pw){
+         reduce_sum += prod_sum[ph][pw];
+       }
+     }
+     gradInput2[n][c][h][w] = reduce_sum;
+   }
+ }
+ } // namespace corr
+
+ torch::Tensor correlation_cuda_forward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW) {
+
+   const int batch_size = input1.size(0);
+   const int iH = input1.size(2);
+   const int iW = input1.size(3);
+   const int dilatedKH = (kH - 1) * dilationH + 1;
+   const int dilatedKW = (kW - 1) * dilationW + 1;
+
+   const auto oH = (iH + 2 * padH - dilatedKH) / dH + 1;
+   const auto oW = (iW + 2 * padW - dilatedKW) / dW + 1;
+   auto output = torch::zeros({batch_size, patchH, patchW, oH, oW}, input1.options());
+
+   auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous();
+   auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous();
+
+   const int threads = THREADS_FORWARD;
+   const dim3 blocks(batch_size, oH, oW);
+
+   AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.scalar_type(), "correlation_forward_cuda", ([&] {
+     TensorAcc4R trInput1_acc = trInput1.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
+     TensorAcc4R trInput2_acc = trInput2.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
+     TensorAcc5R output_acc = output.packed_accessor32<scalar_t,5,RestrictPtrTraits>();
+     corr::correlation_cuda_forward_kernel<scalar_t><<<blocks, threads>>>(
+         trInput1_acc, trInput2_acc, output_acc,
+         kH, kW, patchH, patchW, padH, padW, dilationH, dilationW,
+         dilation_patchH, dilation_patchW, dH, dW);
+   }));
+
+   return output;
+ }
+
+ std::vector<torch::Tensor> correlation_cuda_backward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     torch::Tensor gradOutput,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW) {
+
+   auto gradInput1 = torch::zeros_like(input1);
+   auto gradInput2 = torch::zeros_like(input2);
+
+   const int batch_size = input1.size(0);
+   const int iH = input1.size(2);
+   const int iW = input1.size(3);
+   const int C = input1.size(1);
+
+   const dim3 blocks(C, iH, iW);
+   const dim3 threads(THREADS_BACKWARD, THREADS_BACKWARD);
+
+   AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.scalar_type(), "correlation_backward_cuda", ([&] {
+     TensorAcc4R input1_acc = input1.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
+     TensorAcc4R input2_acc = input2.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
+     TensorAcc4R gradInput1_acc = gradInput1.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
+     TensorAcc4R gradInput2_acc = gradInput2.packed_accessor32<scalar_t,4,RestrictPtrTraits>();
+     TensorAcc5R gradOutput_acc = gradOutput.packed_accessor32<scalar_t,5,RestrictPtrTraits>();
+
+
+     for (int n = 0; n < batch_size; ++n){
+       corr::correlation_cuda_backward_kernel_input1<scalar_t><<<blocks, threads>>>(
+           gradOutput_acc, input2_acc, gradInput1_acc,
+           kH, kW, patchH, patchW, padH, padW,
+           dilationH, dilationW,
+           dilation_patchH, dilation_patchW,
+           dH, dW,
+           n);
+     }
+
+     for (int n = 0; n < batch_size; ++n){
+       corr::correlation_cuda_backward_kernel_input2<scalar_t><<<blocks, threads>>>(
+           gradOutput_acc, input1_acc, gradInput2_acc,
+           kH, kW, patchH, patchW, padH, padW,
+           dilationH, dilationW,
+           dilation_patchH, dilation_patchW,
+           dH, dW,
+           n);
+     }
+   }));
+
+   return {gradInput1, gradInput2};
+ }
Pytorch-Correlation-extension/Correlation_Module/correlation_sampler.cpp ADDED
@@ -0,0 +1,138 @@
+ #include <torch/extension.h>
+ #include <c10/cuda/CUDAGuard.h>
+ #include <vector>
+ #include <iostream>
+
+ // declarations
+
+ torch::Tensor correlation_cpp_forward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW);
+
+ std::vector<torch::Tensor> correlation_cpp_backward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     torch::Tensor grad_output,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW);
+
+ #ifdef USE_CUDA
+
+ #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDA tensor")
+ #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous")
+ #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+ #define CHECK_SAME_DEVICE(x, y) TORCH_CHECK(x.device() == y.device(), #x " is not on same device as " #y)
+
+ torch::Tensor correlation_cuda_forward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW);
+
+ std::vector<torch::Tensor> correlation_cuda_backward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     torch::Tensor grad_output,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW);
+
+ // C++ interface
+
+ torch::Tensor correlation_sample_forward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW) {
+   if (input1.device().is_cuda()){
+     CHECK_INPUT(input1);
+     CHECK_INPUT(input2);
+
+     // set device of input1 as default CUDA device
+     // https://pytorch.org/cppdocs/api/structc10_1_1cuda_1_1_optional_c_u_d_a_guard.html
+     const at::cuda::OptionalCUDAGuard guard_input1(device_of(input1));
+     CHECK_SAME_DEVICE(input1, input2);
+
+     return correlation_cuda_forward(input1, input2, kH, kW, patchH, patchW,
+                                     padH, padW, dilationH, dilationW,
+                                     dilation_patchH, dilation_patchW,
+                                     dH, dW);
+   }else{
+     return correlation_cpp_forward(input1, input2, kH, kW, patchH, patchW,
+                                    padH, padW, dilationH, dilationW,
+                                    dilation_patchH, dilation_patchW,
+                                    dH, dW);
+   }
+ }
+
+ std::vector<torch::Tensor> correlation_sample_backward(
+     torch::Tensor input1,
+     torch::Tensor input2,
+     torch::Tensor grad_output,
+     int kH, int kW,
+     int patchH, int patchW,
+     int padH, int padW,
+     int dilationH, int dilationW,
+     int dilation_patchH, int dilation_patchW,
+     int dH, int dW) {
+
+   if(grad_output.device().is_cuda()){
+     CHECK_INPUT(input1);
+     CHECK_INPUT(input2);
+
+     // set device of input1 as default CUDA device
+     const at::cuda::OptionalCUDAGuard guard_input1(device_of(input1));
+     CHECK_SAME_DEVICE(input1, input2);
+     CHECK_SAME_DEVICE(input1, grad_output);
+
+     return correlation_cuda_backward(input1, input2, grad_output,
+                                      kH, kW, patchH, patchW,
+                                      padH, padW,
+                                      dilationH, dilationW,
+                                      dilation_patchH, dilation_patchW,
+                                      dH, dW);
+   }else{
+     return correlation_cpp_backward(
+         input1, input2, grad_output,
+         kH, kW, patchH, patchW,
+         padH, padW,
+         dilationH, dilationW,
+         dilation_patchH, dilation_patchW,
+         dH, dW);
+   }
+ }
+
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+   m.def("forward", &correlation_sample_forward, "Spatial Correlation Sampler Forward");
+   m.def("backward", &correlation_sample_backward, "Spatial Correlation Sampler backward");
+ }
+
+ #else
+
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+   m.def("forward", &correlation_cpp_forward, "Spatial Correlation Sampler Forward");
+   m.def("backward", &correlation_cpp_backward, "Spatial Correlation Sampler backward");
+ }
+
+ #endif
Pytorch-Correlation-extension/Correlation_Module/spatial_correlation_sampler/__init__.py ADDED
@@ -0,0 +1 @@
+ from .spatial_correlation_sampler import SpatialCorrelationSampler, spatial_correlation_sample
Pytorch-Correlation-extension/Correlation_Module/spatial_correlation_sampler/spatial_correlation_sampler.py ADDED
@@ -0,0 +1,107 @@
+ from torch import nn
+ from torch.autograd import Function
+ from torch.autograd.function import once_differentiable
+ from torch.nn.modules.utils import _pair
+
+ import spatial_correlation_sampler_backend as correlation
+
+
+ def spatial_correlation_sample(input1,
+                                input2,
+                                kernel_size=1,
+                                patch_size=1,
+                                stride=1,
+                                padding=0,
+                                dilation=1,
+                                dilation_patch=1):
+     """Apply spatial correlation sampling from input1 to input2.
+
+     Every parameter except input1 and input2 can be either a single int
+     or a pair of ints. For more information about Spatial Correlation
+     Sampling, see this page:
+     https://lmb.informatik.uni-freiburg.de/Publications/2015/DFIB15/
+
+     Args:
+         input1 : The first parameter.
+         input2 : The second parameter.
+         kernel_size : total size of your correlation kernel, in pixels
+         patch_size : total size of your patch, determining how many
+             different shifts will be applied
+         stride : stride of the spatial sampler, will modify output
+             height and width
+         padding : padding applied to input1 and input2 before applying
+             the correlation sampling, will modify output height and width
+         dilation : dilation of the correlation kernel, acts like a dilated convolution
+         dilation_patch : step for every shift in patch
+
+     Returns:
+         Tensor: Result of correlation sampling
+
+     """
+     return SpatialCorrelationSamplerFunction.apply(input1, input2,
+                                                    kernel_size, patch_size,
+                                                    stride, padding, dilation, dilation_patch)
+
+
+ class SpatialCorrelationSamplerFunction(Function):
+
+     @staticmethod
+     def forward(ctx,
+                 input1,
+                 input2,
+                 kernel_size=1,
+                 patch_size=1,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 dilation_patch=1):
+
+         ctx.save_for_backward(input1, input2)
+         kH, kW = ctx.kernel_size = _pair(kernel_size)
+         patchH, patchW = ctx.patch_size = _pair(patch_size)
+         padH, padW = ctx.padding = _pair(padding)
+         dilationH, dilationW = ctx.dilation = _pair(dilation)
+         dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair(dilation_patch)
+         dH, dW = ctx.stride = _pair(stride)
+
+         output = correlation.forward(input1, input2,
+                                      kH, kW, patchH, patchW,
+                                      padH, padW, dilationH, dilationW,
+                                      dilation_patchH, dilation_patchW,
+                                      dH, dW)
+
+         return output
+
+     @staticmethod
+     @once_differentiable
+     def backward(ctx, grad_output):
+         input1, input2 = ctx.saved_tensors
+
+         kH, kW = ctx.kernel_size
+         patchH, patchW = ctx.patch_size
+         padH, padW = ctx.padding
+         dilationH, dilationW = ctx.dilation
+         dilation_patchH, dilation_patchW = ctx.dilation_patch
+         dH, dW = ctx.stride
+
+         grad_input1, grad_input2 = correlation.backward(input1, input2, grad_output,
+                                                         kH, kW, patchH, patchW,
+                                                         padH, padW, dilationH, dilationW,
+                                                         dilation_patchH, dilation_patchW,
+                                                         dH, dW)
+         return grad_input1, grad_input2, None, None, None, None, None, None
+
+
+ class SpatialCorrelationSampler(nn.Module):
+     def __init__(self, kernel_size=1, patch_size=1, stride=1, padding=0, dilation=1, dilation_patch=1):
+         super(SpatialCorrelationSampler, self).__init__()
+         self.kernel_size = kernel_size
+         self.patch_size = patch_size
+         self.stride = stride
+         self.padding = padding
+         self.dilation = dilation
+         self.dilation_patch = dilation_patch
+
+     def forward(self, input1, input2):
+         return SpatialCorrelationSamplerFunction.apply(input1, input2, self.kernel_size,
+                                                        self.patch_size, self.stride,
+                                                        self.padding, self.dilation, self.dilation_patch)
Pytorch-Correlation-extension/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) [year] [fullname]
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
Pytorch-Correlation-extension/README.md ADDED
@@ -0,0 +1,155 @@
+
+ [![PyPI](https://img.shields.io/pypi/v/spatial-correlation-sampler.svg)](https://pypi.org/project/spatial-correlation-sampler/)
+
+
+ # Pytorch Correlation module
+
+ This is a custom C++/CUDA implementation of a correlation module, used e.g. in [FlowNetC](https://arxiv.org/abs/1504.06852).
+
+ This [tutorial](http://pytorch.org/tutorials/advanced/cpp_extension.html) was used as a basis for the implementation, as well as
+ [NVIDIA's CUDA code](https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package).
+
+ - Build and install the C++ and CUDA extensions by executing `python setup.py install`,
+ - Benchmark C++ vs. CUDA by running `python benchmark.py {cpu, cuda}`,
+ - Run gradient checks on the code by running `python grad_check.py --backend {cpu, cuda}`.
+
+ # Requirements
+
+ This module is expected to compile for Pytorch `2.1.0`.
+
+ Before installation, please check the compatibility of your GPU and CUDA (_Compute Capability_) in the [nvidia docs](https://developer.nvidia.com/cuda-gpus).
+ e.g. the RTX 6000 uses CC=8.9, so we set the environment variable to
+
+ `export TORCH_CUDA_ARCH_LIST="8.9+PTX"`
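+
+ If you are not sure which value to use, you can query the compute capability of your card from PyTorch before building (a minimal sketch, assuming `torch` is already installed and a GPU is visible):
+
+ ```python
+ import torch
+
+ # Print the value to put in TORCH_CUDA_ARCH_LIST, e.g. "8.9+PTX"
+ major, minor = torch.cuda.get_device_capability(0)
+ print(f"{major}.{minor}+PTX")
+ ```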
+
+ # Installation
+
+ Be reminded that this module requires `python3-dev` to compile C++ code, e.g. on Ubuntu run:
+
+ `apt install python3-dev`
+
+ This module is available on pip:
+
+ `pip install spatial-correlation-sampler`
+
+ For a CPU-only version, you can install from source with
+
+ `python setup_cpu.py install`
+
+ # Known Problems
+
+ This module needs a compatible gcc version and CUDA to be compiled.
+ Namely, CUDA 9.1 and below will need gcc5, while CUDA 9.2 and 10.0 will need gcc7.
+ See [this issue](https://github.com/ClementPinard/Pytorch-Correlation-extension/issues/1) for more information.
+
+ # Usage
+
+ The API has a few differences with NVIDIA's module:
+ * The output is now a 5D tensor, which reflects the horizontal and vertical shifts.
+ ```
+ input (B x C x H x W) -> output (B x PatchH x PatchW x oH x oW)
+ ```
+ * Output sizes `oH` and `oW` no longer depend on the patch size, but only on kernel size and padding.
+ * Patch size `patch_size` is now the whole patch, and not only the radii.
+ * `stride1` is now `stride` and `stride2` is `dilation_patch`, which behave like dilated convolutions.
+ * The equivalent `max_displacement` is then `dilation_patch * (patch_size - 1) / 2`.
+ * `dilation` is a new parameter; it acts the same way as a dilated convolution regarding the correlation kernel.
+ * To get the right parameters for FlowNetC, you would have (a quick sanity check of this mapping follows the list):
+ ```
+ kernel_size=1,
+ patch_size=21,
+ stride=1,
+ padding=0,
+ dilation=1,
+ dilation_patch=2
+ ```
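+
+ A quick way to sanity-check these mappings is to recompute the output size and the equivalent `max_displacement` by hand; a minimal sketch using the formulas above (function and variable names are only illustrative):
+
+ ```python
+ # Output size formula used by the extension: (i + 2*pad - dilated_kernel) // stride + 1
+ def expected_output_size(H, W, kernel_size=1, stride=1, padding=0, dilation=1):
+     dilated_k = (kernel_size - 1) * dilation + 1
+     oH = (H + 2 * padding - dilated_k) // stride + 1
+     oW = (W + 2 * padding - dilated_k) // stride + 1
+     return oH, oW
+
+ # FlowNetC-style settings from the list above
+ patch_size, dilation_patch = 21, 2
+ max_displacement = dilation_patch * (patch_size - 1) // 2  # -> 20
+ print(expected_output_size(48, 64), max_displacement)      # -> (48, 64) 20
+ ```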
+
+
+ ## Example
+ ```python
+ import torch
+ from spatial_correlation_sampler import SpatialCorrelationSampler, spatial_correlation_sample
+
+ device = "cuda"
+ batch_size = 1
+ channel = 1
+ H = 10
+ W = 10
+ dtype = torch.float32
+
+ input1 = torch.randint(1, 4, (batch_size, channel, H, W), dtype=dtype, device=device, requires_grad=True)
+ input2 = torch.randint_like(input1, 1, 4).requires_grad_(True)
+
+ # You can either use the function or the module. Note that the module doesn't contain any parameter tensor.
+
+ # function
+
+ out = spatial_correlation_sample(input1,
+                                  input2,
+                                  kernel_size=3,
+                                  patch_size=1,
+                                  stride=2,
+                                  padding=0,
+                                  dilation=2,
+                                  dilation_patch=1)
+
+ # module
+
+ correlation_sampler = SpatialCorrelationSampler(
+     kernel_size=3,
+     patch_size=1,
+     stride=2,
+     padding=0,
+     dilation=2,
+     dilation_patch=1)
+ out = correlation_sampler(input1, input2)
+
+ ```
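+
+ The 5D output can then be flattened into a more conventional 4D cost volume when the rest of the network expects one; a small sketch, assuming `out` has the `(B, PatchH, PatchW, oH, oW)` shape described above:
+
+ ```python
+ # Collapse the two patch dimensions into a single "displacement" channel dimension
+ b, ph, pw, oh, ow = out.size()
+ cost_volume = out.view(b, ph * pw, oh, ow)
+ ```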
+
+ # Benchmark
+
+ * Default parameters are from `benchmark.py`; FlowNetC parameters are the same as used in `FlowNetC` with a batch size of 4, described in [this paper](https://arxiv.org/abs/1504.06852), implemented [here](https://github.com/lmb-freiburg/flownet2) and [here](https://github.com/NVIDIA/flownet2-pytorch/blob/master/networks/FlowNetC.py).
+ * Feel free to file an issue to add entries to this with your hardware!
+
+ ## CUDA Benchmark
+
+ * See [here](https://gist.github.com/ClementPinard/270e910147119831014932f67fb1b5ea) for a benchmark script working with [NVIDIA](https://github.com/NVIDIA/flownet2-pytorch/tree/master/networks/correlation_package)'s code, and Pytorch.
+ * Benchmarks are launched with the environment variable `CUDA_LAUNCH_BLOCKING` set to `1`.
+ * Only `float32` is benchmarked.
+ * FlowNetC correlation parameters were launched with the following commands:
+
+ ```bash
+ CUDA_LAUNCH_BLOCKING=1 python benchmark.py --scale ms -k1 --patch 21 -s1 -p0 --patch_dilation 2 -b4 --height 48 --width 64 -c256 cuda -d float
+
+ CUDA_LAUNCH_BLOCKING=1 python NV_correlation_benchmark.py --scale ms -k1 --patch 21 -s1 -p0 --patch_dilation 2 -b4 --height 48 --width 64 -c256
+ ```
+
+ | implementation | Correlation parameters | device | pass | min time | avg time |
+ | -------------- | ---------------------- | ------- | -------- | ------------: | ------------: |
+ | ours | default | 980 GTX | forward | **5.745 ms** | **5.851 ms** |
+ | ours | default | 980 GTX | backward | 77.694 ms | 77.957 ms |
+ | NVIDIA | default | 980 GTX | forward | 13.779 ms | 13.853 ms |
+ | NVIDIA | default | 980 GTX | backward | **73.383 ms** | **73.708 ms** |
+ | | | | | | |
+ | ours | FlowNetC | 980 GTX | forward | **26.102 ms** | **26.179 ms** |
+ | ours | FlowNetC | 980 GTX | backward | **208.091 ms** | **208.510 ms** |
+ | NVIDIA | FlowNetC | 980 GTX | forward | 35.363 ms | 35.550 ms |
+ | NVIDIA | FlowNetC | 980 GTX | backward | 283.748 ms | 284.346 ms |
+
+ ### Notes
+ * The overhead of our implementation for `kernel_size` > 1 during backward needs some investigation; feel free to
+ dive into the code to improve it!
+ * The backward pass of NVIDIA is not entirely correct when stride1 > 1 and kernel_size > 1, because not everything
+ is computed; see [here](https://github.com/NVIDIA/flownet2-pytorch/blob/master/networks/correlation_package/src/correlation_cuda_kernel.cu#L120).
+
+ ## CPU Benchmark
+
+ * No other implementation is available on CPU.
+ * It is obviously not recommended to run it on CPU if you have a GPU.
+
+ | Correlation parameters | device | pass | min time | avg time |
+ | ---------------------- | -------------------- | -------- | ----------: | ----------: |
+ | default | E5-2630 v3 @ 2.40GHz | forward | 159.616 ms | 188.727 ms |
+ | default | E5-2630 v3 @ 2.40GHz | backward | 282.641 ms | 294.194 ms |
+ | FlowNetC | E5-2630 v3 @ 2.40GHz | forward | 2.138 s | 2.144 s |
+ | FlowNetC | E5-2630 v3 @ 2.40GHz | backward | 7.006 s | 7.075 s |
Pytorch-Correlation-extension/benchmark.py ADDED
@@ -0,0 +1,90 @@
+ from __future__ import division
+ from __future__ import print_function
+
+ import argparse
+ import time
+
+ import torch
+ from spatial_correlation_sampler import SpatialCorrelationSampler
+ from tqdm import trange
+
+ TIME_SCALES = {'s': 1, 'ms': 1000, 'us': 1000000}
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('backend', choices=['cpu', 'cuda'], default='cuda')
+ parser.add_argument('-b', '--batch-size', type=int, default=16)
+ parser.add_argument('-k', '--kernel-size', type=int, default=3)
+ parser.add_argument('--patch', type=int, default=3)
+ parser.add_argument('--patch_dilation', type=int, default=2)
+ parser.add_argument('-c', '--channel', type=int, default=64)
+ parser.add_argument('--height', type=int, default=100)
+ parser.add_argument('-w', '--width', type=int, default=100)
+ parser.add_argument('-s', '--stride', type=int, default=2)
+ parser.add_argument('-p', '--pad', type=int, default=1)
+ parser.add_argument('--scale', choices=['s', 'ms', 'us'], default='us')
+ parser.add_argument('-r', '--runs', type=int, default=100)
+ parser.add_argument('--dilation', type=int, default=2)
+ parser.add_argument('-d', '--dtype', choices=['half', 'float', 'double'])
+
+ args = parser.parse_args()
+
+ device = torch.device(args.backend)
+
+ if args.dtype == 'half':
+     dtype = torch.float16
+ elif args.dtype == 'float':
+     dtype = torch.float32
+ else:
+     dtype = torch.float64
+
+
+ input1 = torch.randn(args.batch_size,
+                      args.channel,
+                      args.height,
+                      args.width,
+                      dtype=dtype,
+                      device=device,
+                      requires_grad=True)
+ input2 = torch.randn_like(input1)
+
+ correlation_sampler = SpatialCorrelationSampler(
+     args.kernel_size,
+     args.patch,
+     args.stride,
+     args.pad,
+     args.dilation,
+     args.patch_dilation)
+
+ # Force CUDA initialization
+ output = correlation_sampler(input1, input2)
+ print(output.size())
+ output.mean().backward()
+ forward_min = float('inf')
+ forward_time = 0
+ backward_min = float('inf')
+ backward_time = 0
+ for _ in trange(args.runs):
+     correlation_sampler.zero_grad()
+
+     start = time.time()
+     output = correlation_sampler(input1, input2)
+     elapsed = time.time() - start
+     forward_min = min(forward_min, elapsed)
+     forward_time += elapsed
+     output = output.mean()
+
+     start = time.time()
+     (output.mean()).backward()
+     elapsed = time.time() - start
+     backward_min = min(backward_min, elapsed)
+     backward_time += elapsed
+
+ scale = TIME_SCALES[args.scale]
+ forward_min *= scale
+ backward_min *= scale
+ forward_average = forward_time / args.runs * scale
+ backward_average = backward_time / args.runs * scale
+
+ print('Forward: {0:.3f}/{1:.3f} {4} | Backward {2:.3f}/{3:.3f} {4}'.format(
+     forward_min, forward_average, backward_min, backward_average,
+     args.scale))
Pytorch-Correlation-extension/check.py ADDED
@@ -0,0 +1,119 @@
+ from __future__ import division
+ from __future__ import print_function
+
+ import argparse
+ import numpy as np
+ import torch
+
+ from spatial_correlation_sampler import SpatialCorrelationSampler
+
+
+ def check_equal(first, second, verbose):
+     if verbose:
+         print()
+     for i, (x, y) in enumerate(zip(first, second)):
+         x = x.cpu().detach().numpy()
+         y = y.cpu().detach().numpy()
+         if verbose:
+             print("x = {}".format(x.flatten()))
+             print("y = {}".format(y.flatten()))
+             print('-' * 80)
+         np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i))
+
+
+ def zero_grad(variables):
+     for variable in variables:
+         if variable.grad is not None: variable.grad.zero_()
+
+
+ def get_grads(variables):
+     return [var.grad.clone() for var in variables]
+
+
+ def check_forward(input1, input2, correlation_sampler, verbose, gpu_index=0):
+     device = torch.device(f"cuda:{gpu_index}")
+
+     cpu_values = correlation_sampler(input1, input2)
+     cuda_values = correlation_sampler(input1.to(device), input2.to(device))
+
+     print(f"Forward: CPU vs. CUDA device:{gpu_index} ... ", end='')
+     check_equal(cpu_values, cuda_values, verbose)
+     print('Ok')
+
+
+ def check_backward(input1, input2, correlation_sampler, verbose, gpu_index=0):
+     device = torch.device(f"cuda:{gpu_index}")
+
+     zero_grad([input1, input2])
+
+     cpu_values = correlation_sampler(input1, input2)
+     cpu_values.sum().backward()
+     grad_cpu = get_grads([input1, input2])
+
+     zero_grad([input1, input2])
+
+     cuda_values = correlation_sampler(input1.to(device), input2.to(device))
+     cuda_values.sum().backward()
+     grad_cuda = get_grads([input1, input2])
+
+     print(f"Backward: CPU vs. CUDA device:{gpu_index} ... ", end='')
+     check_equal(grad_cpu, grad_cuda, verbose)
+     print('Ok')
+
+
+ def check_multi_gpu_forward(correlation_sampler, verbose):
+     print("Multi-GPU forward")
+     total_gpus = torch.cuda.device_count()
+     for gpu in range(total_gpus):
+         check_forward(input1, input2, correlation_sampler, verbose, gpu_index=gpu)
+
+ def check_multi_gpu_backward(correlation_sampler, verbose):
+     print("Multi-GPU backward")
+     total_gpus = torch.cuda.device_count()
+     for gpu in range(total_gpus):
+         check_backward(input1, input2, correlation_sampler, verbose, gpu_index=gpu)
+
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('direction', choices=['forward', 'backward'], nargs='+')
+ parser.add_argument('-b', '--batch-size', type=int, default=1)
+ parser.add_argument('-k', '--kernel-size', type=int, default=3)
+ parser.add_argument('--patch', type=int, default=3)
+ parser.add_argument('--patch_dilation', type=int, default=2)
+ parser.add_argument('-c', '--channel', type=int, default=10)
+ parser.add_argument('--height', type=int, default=10)
+ parser.add_argument('-w', '--width', type=int, default=10)
+ parser.add_argument('-s', '--stride', type=int, default=2)
+ parser.add_argument('-p', '--pad', type=int, default=5)
+ parser.add_argument('-v', '--verbose', action='store_true', default=False)
+ parser.add_argument('-d', '--dilation', type=int, default=2)
+ args = parser.parse_args()
+ print(args)
+
+ assert(torch.cuda.is_available()), "no comparison to make"
+ input1 = torch.randn(args.batch_size,
+                      args.channel,
+                      args.height,
+                      args.width).double()
+ input2 = torch.randn(args.batch_size,
+                      args.channel,
+                      args.height,
+                      args.width).double()
+ input1.requires_grad = True
+ input2.requires_grad = True
+
+ correlation_sampler = SpatialCorrelationSampler(
+     args.kernel_size,
+     args.patch,
+     args.stride,
+     args.pad,
+     args.dilation,
+     args.patch_dilation)
+
+ if 'forward' in args.direction:
+     check_forward(input1, input2, correlation_sampler, args.verbose)
+     if torch.cuda.device_count() > 1: check_multi_gpu_forward(correlation_sampler, args.verbose)
+
+ if 'backward' in args.direction:
+     check_backward(input1, input2, correlation_sampler, args.verbose)
+     if torch.cuda.device_count() > 1: check_multi_gpu_backward(correlation_sampler, args.verbose)
Pytorch-Correlation-extension/grad_check.py ADDED
@@ -0,0 +1,47 @@
+ import argparse
+ import torch
+ # torch.set_printoptions(precision=1, threshold=10000)
+ from torch.autograd import gradcheck
+ from spatial_correlation_sampler import SpatialCorrelationSampler
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('backend', choices=['cpu', 'cuda'], default='cuda')
+ parser.add_argument('-b', '--batch-size', type=int, default=2)
+ parser.add_argument('-k', '--kernel-size', type=int, default=3)
+ parser.add_argument('--patch', type=int, default=3)
+ parser.add_argument('--patch_dilation', type=int, default=2)
+ parser.add_argument('-c', '--channel', type=int, default=2)
+ parser.add_argument('--height', type=int, default=10)
+ parser.add_argument('-w', '--width', type=int, default=10)
+ parser.add_argument('-s', '--stride', type=int, default=2)
+ parser.add_argument('-p', '--pad', type=int, default=1)
+ parser.add_argument('-d', '--dilation', type=int, default=2)
+
+ args = parser.parse_args()
+
+ input1 = torch.randn(args.batch_size,
+                      args.channel,
+                      args.height,
+                      args.width,
+                      dtype=torch.float64,
+                      device=torch.device(args.backend))
+ input2 = torch.randn(args.batch_size,
+                      args.channel,
+                      args.height,
+                      args.width,
+                      dtype=torch.float64,
+                      device=torch.device(args.backend))
+
+ input1.requires_grad = True
+ input2.requires_grad = True
+
+ correlation_sampler = SpatialCorrelationSampler(args.kernel_size,
+                                                 args.patch,
+                                                 args.stride,
+                                                 args.pad,
+                                                 args.dilation,
+                                                 args.patch_dilation)
+
+
+ if gradcheck(correlation_sampler, [input1, input2]):
+     print('Ok')
Pytorch-Correlation-extension/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ torch>=1.0.1
+ numpy
Pytorch-Correlation-extension/setup.py ADDED
@@ -0,0 +1,69 @@
+ import os
+ from setuptools import setup
+ from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
+ from os.path import join
+
+ CPU_ONLY = False
+ project_root = 'Correlation_Module'
+
+ source_files = ['correlation.cpp', 'correlation_sampler.cpp']
+
+ cxx_args = ['-std=c++17', '-fopenmp']
+
+ def generate_nvcc_args(gpu_archs):
+     nvcc_args = []
+     for arch in gpu_archs:
+         nvcc_args.extend(['-gencode', f'arch=compute_{arch},code=sm_{arch}'])
+     return nvcc_args
+
+ gpu_arch = os.environ.get('GPU_ARCH', '').split()
+ nvcc_args = generate_nvcc_args(gpu_arch)
+
+ with open("README.md", "r") as fh:
+     long_description = fh.read()
+
+
+ def launch_setup():
+     if CPU_ONLY:
+         Extension = CppExtension
+         macro = []
+     else:
+         Extension = CUDAExtension
+         source_files.append('correlation_cuda_kernel.cu')
+         macro = [("USE_CUDA", None)]
+
+     sources = [join(project_root, file) for file in source_files]
+
+     setup(
+         name='spatial_correlation_sampler',
+         version="0.4.0",
+         author="Clément Pinard",
+         author_email="[email protected]",
+         description="Correlation module for pytorch",
+         long_description=long_description,
+         long_description_content_type="text/markdown",
+         url="https://github.com/ClementPinard/Pytorch-Correlation-extension",
+         install_requires=['torch>=1.1', 'numpy'],
+         ext_modules=[
+             Extension('spatial_correlation_sampler_backend',
+                       sources,
+                       define_macros=macro,
+                       extra_compile_args={'cxx': cxx_args, 'nvcc': nvcc_args},
+                       extra_link_args=['-lgomp'])
+         ],
+         package_dir={'': project_root},
+         packages=['spatial_correlation_sampler'],
+         cmdclass={
+             'build_ext': BuildExtension
+         },
+         classifiers=[
+             "Programming Language :: Python :: 3",
+             "License :: OSI Approved :: MIT License",
+             "Operating System :: POSIX :: Linux",
+             "Intended Audience :: Science/Research",
+             "Topic :: Scientific/Engineering :: Artificial Intelligence"
+         ])
+
+
+ if __name__ == '__main__':
+     launch_setup()
Pytorch-Correlation-extension/setup_cpu.py ADDED
@@ -0,0 +1,4 @@
+ import setup
+
+ setup.CPU_ONLY = True
+ setup.launch_setup()