Commit 01994c93 authored by Rodger Combs's avatar Rodger Combs Committed by Timo Rothenpieler

build: add support for building CUDA files with clang

This avoids using the CUDA SDK at all; instead, we provide a minimal
reimplementation of the basic functionality that lavfi actually uses.
It generates very similar code to what NVCC produces.

The header contains no implementation code derived from the SDK.
The function and type declarations are derived from the SDK only to the
extent required to build a compatible implementation. This is generally
accepted to qualify as fair use.

Because this option does not require the proprietary SDK, it does not require
the "--enable-nonfree" flag in configure.
Signed-off-by: 's avatarTimo Rothenpieler <timo@rothenpieler.org>
parent ad97be9f
...@@ -36,6 +36,7 @@ version 4.2: ...@@ -36,6 +36,7 @@ version 4.2:
- derain filter - derain filter
- deesser filter - deesser filter
- mov muxer writes tracks with unspecified language instead of English by default - mov muxer writes tracks with unspecified language instead of English by default
- add support for using clang to compile CUDA kernels
version 4.1: version 4.1:
......
/*
* Minimum CUDA compatibility definitions header
*
* Copyright (c) 2019 Rodger Combs
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AV_COMPAT_CUDA_CUDA_RUNTIME_H
#define AV_COMPAT_CUDA_CUDA_RUNTIME_H
// Common macros
#define __global__ __attribute__((global))
#define __device__ __attribute__((device))
#define __device_builtin__ __attribute__((device_builtin))
#define __align__(N) __attribute__((aligned(N)))
#define __inline__ __inline__ __attribute__((always_inline))
#define max(a, b) ((a) > (b) ? (a) : (b))
#define min(a, b) ((a) < (b) ? (a) : (b))
#define abs(x) ((x) < 0 ? -(x) : (x))
#define atomicAdd(a, b) (__atomic_fetch_add(a, b, __ATOMIC_SEQ_CST))
// Basic typedefs
typedef __device_builtin__ unsigned long long cudaTextureObject_t;
typedef struct __device_builtin__ __align__(2) uchar2
{
unsigned char x, y;
} uchar2;
typedef struct __device_builtin__ __align__(4) ushort2
{
unsigned short x, y;
} ushort2;
typedef struct __device_builtin__ uint3
{
unsigned int x, y, z;
} uint3;
typedef struct uint3 dim3;
typedef struct __device_builtin__ __align__(8) int2
{
int x, y;
} int2;
typedef struct __device_builtin__ __align__(4) uchar4
{
unsigned char x, y, z, w;
} uchar4;
typedef struct __device_builtin__ __align__(8) ushort4
{
unsigned char x, y, z, w;
} ushort4;
typedef struct __device_builtin__ __align__(16) int4
{
int x, y, z, w;
} int4;
// Accessors for special registers
#define GETCOMP(reg, comp) \
asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \
ret.comp = tmp;
#define GET(name, reg) static inline __device__ uint3 name() {\
uint3 ret; \
unsigned tmp; \
GETCOMP(reg, x) \
GETCOMP(reg, y) \
GETCOMP(reg, z) \
return ret; \
}
GET(getBlockIdx, ctaid)
GET(getBlockDim, ntid)
GET(getThreadIdx, tid)
// Instead of externs for these registers, we turn access to them into calls into trivial ASM
#define blockIdx (getBlockIdx())
#define blockDim (getBlockDim())
#define threadIdx (getThreadIdx())
// Basic initializers (simple macros rather than inline functions)
#define make_uchar2(a, b) ((uchar2){.x = a, .y = b})
#define make_ushort2(a, b) ((ushort2){.x = a, .y = b})
#define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d})
#define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d})
// Conversions from the tex instruction's 4-register output to various types
#define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);}
TEX2D(unsigned char, a & 0xFF)
TEX2D(unsigned short, a & 0xFFFF)
TEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF))
TEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF))
TEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF))
TEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF))
// Template calling tex instruction and converting the output to the selected type
template <class T>
static inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
{
T ret;
unsigned ret1, ret2, ret3, ret4;
asm("tex.2d.v4.u32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
"=r"(ret1), "=r"(ret2), "=r"(ret3), "=r"(ret4) :
"l"(texObject), "f"(x), "f"(y));
conv(&ret, ret1, ret2, ret3, ret4);
return ret;
}
#endif
...@@ -322,6 +322,7 @@ External library support: ...@@ -322,6 +322,7 @@ External library support:
--disable-amf disable AMF video encoding code [autodetect] --disable-amf disable AMF video encoding code [autodetect]
--disable-audiotoolbox disable Apple AudioToolbox code [autodetect] --disable-audiotoolbox disable Apple AudioToolbox code [autodetect]
--enable-cuda-nvcc enable Nvidia CUDA compiler [no] --enable-cuda-nvcc enable Nvidia CUDA compiler [no]
--disable-cuda-llvm disable CUDA compilation using clang [autodetect]
--disable-cuvid disable Nvidia CUVID support [autodetect] --disable-cuvid disable Nvidia CUVID support [autodetect]
--disable-d3d11va disable Microsoft Direct3D 11 video acceleration code [autodetect] --disable-d3d11va disable Microsoft Direct3D 11 video acceleration code [autodetect]
--disable-dxva2 disable Microsoft DirectX 9 video acceleration code [autodetect] --disable-dxva2 disable Microsoft DirectX 9 video acceleration code [autodetect]
...@@ -370,7 +371,7 @@ Toolchain options: ...@@ -370,7 +371,7 @@ Toolchain options:
--cxx=CXX use C compiler CXX [$cxx_default] --cxx=CXX use C compiler CXX [$cxx_default]
--objcc=OCC use ObjC compiler OCC [$cc_default] --objcc=OCC use ObjC compiler OCC [$cc_default]
--dep-cc=DEPCC use dependency generator DEPCC [$cc_default] --dep-cc=DEPCC use dependency generator DEPCC [$cc_default]
--nvcc=NVCC use Nvidia CUDA compiler NVCC [$nvcc_default] --nvcc=NVCC use Nvidia CUDA compiler NVCC or clang [$nvcc_default]
--ld=LD use linker LD [$ld_default] --ld=LD use linker LD [$ld_default]
--pkg-config=PKGCONFIG use pkg-config tool PKGCONFIG [$pkg_config_default] --pkg-config=PKGCONFIG use pkg-config tool PKGCONFIG [$pkg_config_default]
--pkg-config-flags=FLAGS pass additional flags to pkgconf [] --pkg-config-flags=FLAGS pass additional flags to pkgconf []
...@@ -1038,12 +1039,16 @@ test_nvcc(){ ...@@ -1038,12 +1039,16 @@ test_nvcc(){
tmpcu_=$TMPCU tmpcu_=$TMPCU
tmpo_=$TMPO tmpo_=$TMPO
[ -x "$(command -v cygpath)" ] && tmpcu_=$(cygpath -m $tmpcu_) && tmpo_=$(cygpath -m $tmpo_) [ -x "$(command -v cygpath)" ] && tmpcu_=$(cygpath -m $tmpcu_) && tmpo_=$(cygpath -m $tmpo_)
test_cmd $nvcc -ptx $NVCCFLAGS "$@" $NVCC_C $(nvcc_o $tmpo_) $tmpcu_ test_cmd $nvcc $nvccflags "$@" $NVCC_C $(nvcc_o $tmpo_) $tmpcu_
} }
check_nvcc() { check_nvcc() {
log check_nvcc "$@" log check_nvcc "$@"
test_nvcc <<EOF name=$1
shift 1
disabled $name && return
disable $name
test_nvcc "$@" <<EOF && enable $name
extern "C" { extern "C" {
__global__ void hello(unsigned char *data) {} __global__ void hello(unsigned char *data) {}
} }
...@@ -1814,6 +1819,7 @@ HWACCEL_AUTODETECT_LIBRARY_LIST=" ...@@ -1814,6 +1819,7 @@ HWACCEL_AUTODETECT_LIBRARY_LIST="
audiotoolbox audiotoolbox
crystalhd crystalhd
cuda cuda
cuda_llvm
cuvid cuvid
d3d11va d3d11va
dxva2 dxva2
...@@ -2987,8 +2993,10 @@ v4l2_m2m_deps="linux_videodev2_h sem_timedwait" ...@@ -2987,8 +2993,10 @@ v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
hwupload_cuda_filter_deps="ffnvcodec" hwupload_cuda_filter_deps="ffnvcodec"
scale_npp_filter_deps="ffnvcodec libnpp" scale_npp_filter_deps="ffnvcodec libnpp"
scale_cuda_filter_deps="ffnvcodec cuda_nvcc" scale_cuda_filter_deps="ffnvcodec"
thumbnail_cuda_filter_deps="ffnvcodec cuda_nvcc" scale_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
thumbnail_cuda_filter_deps="ffnvcodec"
thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
transpose_npp_filter_deps="ffnvcodec libnpp" transpose_npp_filter_deps="ffnvcodec libnpp"
amf_deps_any="libdl LoadLibrary" amf_deps_any="libdl LoadLibrary"
...@@ -3547,7 +3555,8 @@ zscale_filter_deps="libzimg const_nan" ...@@ -3547,7 +3555,8 @@ zscale_filter_deps="libzimg const_nan"
scale_vaapi_filter_deps="vaapi" scale_vaapi_filter_deps="vaapi"
vpp_qsv_filter_deps="libmfx" vpp_qsv_filter_deps="libmfx"
vpp_qsv_filter_select="qsvvpp" vpp_qsv_filter_select="qsvvpp"
yadif_cuda_filter_deps="ffnvcodec cuda_nvcc" yadif_cuda_filter_deps="ffnvcodec"
yadif_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
# examples # examples
avio_dir_cmd_deps="avformat avutil" avio_dir_cmd_deps="avformat avutil"
...@@ -3651,8 +3660,6 @@ version_script='--version-script' ...@@ -3651,8 +3660,6 @@ version_script='--version-script'
objformat="elf32" objformat="elf32"
x86asmexe_default="nasm" x86asmexe_default="nasm"
windres_default="windres" windres_default="windres"
nvcc_default="nvcc"
nvccflags_default="-gencode arch=compute_30,code=sm_30 -O2"
striptype="direct" striptype="direct"
# OS # OS
...@@ -4220,6 +4227,20 @@ windres_default="${cross_prefix}${windres_default}" ...@@ -4220,6 +4227,20 @@ windres_default="${cross_prefix}${windres_default}"
sysinclude_default="${sysroot}/usr/include" sysinclude_default="${sysroot}/usr/include"
if enabled cuda_sdk; then
warn "Option --enable-cuda-sdk is deprecated. Use --enable-cuda-nvcc instead."
enable cuda_nvcc
fi
if enabled cuda_nvcc; then
nvcc_default="nvcc"
nvccflags_default="-gencode arch=compute_30,code=sm_30 -O2"
else
nvcc_default="clang"
nvccflags_default="--cuda-gpu-arch=sm_30 -O2"
NVCC_C=""
fi
set_default arch cc cxx doxygen pkg_config ranlib strip sysinclude \ set_default arch cc cxx doxygen pkg_config ranlib strip sysinclude \
target_exec x86asmexe nvcc target_exec x86asmexe nvcc
enabled cross_compile || host_cc_default=$cc enabled cross_compile || host_cc_default=$cc
...@@ -6060,9 +6081,21 @@ check_type "d3d9.h dxva2api.h" DXVA2_ConfigPictureDecode -D_WIN32_WINNT=0x0602 ...@@ -6060,9 +6081,21 @@ check_type "d3d9.h dxva2api.h" DXVA2_ConfigPictureDecode -D_WIN32_WINNT=0x0602
check_type "vdpau/vdpau.h" "VdpPictureInfoHEVC" check_type "vdpau/vdpau.h" "VdpPictureInfoHEVC"
if enabled cuda_sdk; then if [ -z "$nvccflags" ]; then
warn "Option --enable-cuda-sdk is deprecated. Use --enable-cuda-nvcc instead." nvccflags=$nvccflags_default
enable cuda_nvcc fi
if enabled x86_64 || enabled ppc64 || enabled aarch64; then
nvccflags="$nvccflags -m64"
else
nvccflags="$nvccflags -m32"
fi
if enabled cuda_nvcc; then
nvccflags="$nvccflags -ptx"
else
nvccflags="$nvccflags -S -nocudalib -nocudainc --cuda-device-only -include${source_link}/compat/cuda/cuda_runtime.h"
check_nvcc cuda_llvm
fi fi
if ! disabled ffnvcodec; then if ! disabled ffnvcodec; then
...@@ -6140,7 +6173,7 @@ for func in $COMPLEX_FUNCS; do ...@@ -6140,7 +6173,7 @@ for func in $COMPLEX_FUNCS; do
done done
# these are off by default, so fail if requested and not available # these are off by default, so fail if requested and not available
enabled cuda_nvcc && { check_nvcc || die "ERROR: failed checking for nvcc."; } enabled cuda_nvcc && { check_nvcc cuda_nvcc || die "ERROR: failed checking for nvcc."; }
enabled chromaprint && require chromaprint chromaprint.h chromaprint_get_version -lchromaprint enabled chromaprint && require chromaprint chromaprint.h chromaprint_get_version -lchromaprint
enabled decklink && { require_headers DeckLinkAPI.h && enabled decklink && { require_headers DeckLinkAPI.h &&
{ test_cpp_condition DeckLinkAPIVersion.h "BLACKMAGIC_DECKLINK_API_VERSION >= 0x0a090500" || die "ERROR: Decklink API version must be >= 10.9.5."; } } { test_cpp_condition DeckLinkAPIVersion.h "BLACKMAGIC_DECKLINK_API_VERSION >= 0x0a090500" || die "ERROR: Decklink API version must be >= 10.9.5."; } }
...@@ -6701,16 +6734,6 @@ if [ -z "$optflags" ]; then ...@@ -6701,16 +6734,6 @@ if [ -z "$optflags" ]; then
fi fi
fi fi
if [ -z "$nvccflags" ]; then
nvccflags=$nvccflags_default
fi
if enabled x86_64 || enabled ppc64 || enabled aarch64; then
nvccflags="$nvccflags -m64"
else
nvccflags="$nvccflags -m32"
fi
check_optflags(){ check_optflags(){
check_cflags "$@" check_cflags "$@"
enabled lto && check_ldflags "$@" enabled lto && check_ldflags "$@"
......
...@@ -38,7 +38,6 @@ OBJCCFLAGS = $(CPPFLAGS) $(CFLAGS) $(OBJCFLAGS) ...@@ -38,7 +38,6 @@ OBJCCFLAGS = $(CPPFLAGS) $(CFLAGS) $(OBJCFLAGS)
ASFLAGS := $(CPPFLAGS) $(ASFLAGS) ASFLAGS := $(CPPFLAGS) $(ASFLAGS)
CXXFLAGS := $(CPPFLAGS) $(CFLAGS) $(CXXFLAGS) CXXFLAGS := $(CPPFLAGS) $(CFLAGS) $(CXXFLAGS)
X86ASMFLAGS += $(IFLAGS:%=%/) -I$(<D)/ -Pconfig.asm X86ASMFLAGS += $(IFLAGS:%=%/) -I$(<D)/ -Pconfig.asm
NVCCFLAGS += -ptx
HOSTCCFLAGS = $(IFLAGS) $(HOSTCPPFLAGS) $(HOSTCFLAGS) HOSTCCFLAGS = $(IFLAGS) $(HOSTCPPFLAGS) $(HOSTCFLAGS)
LDFLAGS := $(ALLFFLIBS:%=$(LD_PATH)lib%) $(LDFLAGS) LDFLAGS := $(ALLFFLIBS:%=$(LD_PATH)lib%) $(LDFLAGS)
...@@ -91,7 +90,7 @@ COMPILE_NVCC = $(call COMPILE,NVCC) ...@@ -91,7 +90,7 @@ COMPILE_NVCC = $(call COMPILE,NVCC)
%.h.c: %.h.c:
$(Q)echo '#include "$*.h"' >$@ $(Q)echo '#include "$*.h"' >$@
%.ptx: %.cu %.ptx: %.cu $(SRC_PATH)/compat/cuda/cuda_runtime.h
$(COMPILE_NVCC) $(COMPILE_NVCC)
%.ptx.c: %.ptx %.ptx.c: %.ptx
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment