Ticket #384: csc_nvcuda-withcustomkernel.patch
File csc_nvcuda-withcustomkernel.patch, 33.4 KB (added by , 9 years ago) |
---|
-
xpra/codecs/csc_nvcuda/CUDA_kernels.py
1 # This file is part of Xpra. 2 # Copyright (C) 2013 Antoine Martin <antoine@devloop.org.uk> 3 # Xpra is released under the terms of the GNU GPL v2, or, at your option, any 4 # later version. See the file COPYING for details. 5 6 YUV_FORMATS = ("YUV444P", "YUV422P", "YUV420P") 7 8 def rgb_only_name(rgb_format): 9 #strip out X and A: 10 n = rgb_format.replace("A", "").replace("X", "").upper() 11 for x in n: 12 assert x in ("R", "G", "B"), "invalid character found in rgb format: '%s' in %s" % (x, rgb_format) 13 return n 14 15 def gen_rgb_to_nv12_kernel(rgb_format): 16 R = rgb_format.find("R") 17 G = rgb_format.find("G") 18 B = rgb_format.find("B") 19 assert R>=0 and G>=0 and B>=0, "invalid format: %s" % rgb_format 20 21 kernel_name = "%s_to_NV12" % rgb_only_name(rgb_format) 22 args = [kernel_name] + [R, G, B] * 4; 23 return kernel_name, """ 24 #include <stdint.h> 25 26 __global__ void %s(uint8_t *srcImage, int srcPitch, 27 uint8_t *dstImage, int dstPitch, int dstHeight, 28 int w, int h) 29 { 30 uint32_t gx, gy; 31 gx = blockIdx.x * blockDim.x + threadIdx.x; 32 gy = blockIdx.y * blockDim.y + threadIdx.y; 33 34 if ((gx*2 < w) & (gy*2 < h)) { 35 //4 bytes per pixel, and 2 pixels width/height at a time: 36 uint32_t si = (gy * 2 * srcPitch) + gx * 2 * 4; 37 38 //we may read up to 4 32-bit RGB pixels: 39 uint8_t R[4]; 40 uint8_t G[4]; 41 uint8_t B[4]; 42 uint8_t j = 0; 43 R[0] = srcImage[si+%s]; 44 G[0] = srcImage[si+%s]; 45 B[0] = srcImage[si+%s]; 46 for (j=0; j<4; j++) { 47 R[j] = R[0]; 48 G[j] = G[0]; 49 B[j] = B[0]; 50 } 51 52 //write up to 4 Y pixels: 53 uint32_t di = (gy * 2 * dstPitch) + gx * 2; 54 dstImage[di] = __float2int_rn(0.257 * R[0] + 0.504 * G[0] + 0.098 * B[0] + 16); 55 if (gx*2 + 1 < w) { 56 R[1] = srcImage[si+%s]; 57 G[1] = srcImage[si+%s]; 58 B[1] = srcImage[si+%s]; 59 dstImage[di + 1] = __float2int_rn(0.257 * R[1] + 0.504 * G[1] + 0.098 * B[1] + 16); 60 } 61 if (gy*2 + 1 < h) { 62 si += srcPitch; 63 di += dstPitch; 64 R[2] = srcImage[si+2]; 65 G[2] = srcImage[si+1]; 66 B[2] = srcImage[si]; 67 dstImage[di] = __float2int_rn(0.257 * R[2] + 0.504 * G[2] + 0.098 * B[2] + 16); 68 if (gx*2 + 1 < w) { 69 R[3] = srcImage[si+%s]; 70 G[3] = srcImage[si+%s]; 71 B[3] = srcImage[si+%s]; 72 dstImage[di + 1] = __float2int_rn(0.257 * R[3] + 0.504 * G[3] + 0.098 * B[3] + 16); 73 } 74 } 75 76 //write 1 U and 1 V pixel: 77 float_t u = 0; 78 float_t v = 0; 79 for (j=0; j<4; j++) { 80 u += -0.148 * R[j] - 0.291 * G[j] + 0.439 * B[j] + 128; 81 v += 0.439 * R[j] - 0.368 * G[j] - 0.071 * B[j] + 128; 82 } 83 di = (dstHeight + gy) * dstPitch + gx * 2; 84 dstImage[di] = __float2int_rn(u / 4.0); 85 dstImage[di + 1] = __float2int_rn(v / 4.0); 86 } 87 } 88 """ % args 89 90 def gen_rgb_to_yuv444p_kernel(rgb_format): 91 R = rgb_format.find("R") 92 G = rgb_format.find("G") 93 B = rgb_format.find("B") 94 assert R>=0 and G>=0 and B>=0, "invalid format: %s" % rgb_format 95 96 kernel_name = "%s_to_YUV444P" % rgb_only_name(rgb_format) 97 args = [kernel_name] + [R, G, B]; 98 return kernel_name, """ 99 #include <stdint.h> 100 101 __global__ void %s(uint8_t *srcImage, int srcPitch, 102 uint8_t *Y, int strideY, 103 uint8_t *U, int strideU, 104 uint8_t *V, int strideV, 105 int w, int h) 106 { 107 uint32_t gx, gy; 108 gx = blockIdx.x * blockDim.x + threadIdx.x; 109 gy = blockIdx.y * blockDim.y + threadIdx.y; 110 111 if ((gx < w) & (gy < h)) { 112 //4 bytes per pixel: 113 uint32_t si = (gy * srcPitch) + gx * 4; 114 uint8_t R = srcImage[si+%s]; 115 uint8_t G = srcImage[si+%s]; 116 uint8_t B = srcImage[si+%s]; 117 118 Y[(gy * strideY) + gx] = __float2int_rn( 0.257 * R + 0.504 * G + 0.098 * B + 16); 119 U[(gy * strideU) + gx] = __float2int_rn(-0.148 * R - 0.291 * G + 0.439 * B + 128); 120 V[(gy * strideV) + gx] = __float2int_rn( 0.439 * R - 0.368 * G - 0.071 * B + 128); 121 } 122 """ % args 123 124 def gen_rgb_to_yuv422p_kernel(rgb_format): 125 R = rgb_format.find("R") 126 G = rgb_format.find("G") 127 B = rgb_format.find("B") 128 assert R>=0 and G>=0 and B>=0, "invalid format: %s" % rgb_format 129 130 kernel_name = "%s_to_YUV422P" % rgb_only_name(rgb_format) 131 args = [kernel_name] + [R, G, B] * 2; 132 return kernel_name, """ 133 #include <stdint.h> 134 135 __global__ void %s(uint8_t *srcImage, int srcPitch, 136 uint8_t *Y, int strideY, 137 uint8_t *U, int strideU, 138 uint8_t *V, int strideV, 139 int w, int h) 140 { 141 uint32_t gx, gy; 142 gx = blockIdx.x * blockDim.x + threadIdx.x; 143 gy = blockIdx.y * blockDim.y + threadIdx.y; 144 145 if ((gx*2 < w) & (gy < h)) { 146 //4 bytes per pixel, reading up to 2 pixels at a time: 147 uint32_t si = (gy * srcPitch) + gx * 4 * 2; 148 149 uint8_t R[2]; 150 uint8_t G[2]; 151 uint8_t B[2]; 152 uint8_t j = 0; 153 154 R[0] = srcImage[si+%s]; 155 G[0] = srcImage[si+%s]; 156 B[0] = srcImage[si+%s]; 157 R[1] = R[0]; 158 G[1] = G[0]; 159 B[1] = B[0]; 160 161 //write up to 2 Y pixels: 162 uint i = gy*strideY + gx*2; 163 dstY[i] = __float2int_rn(0.257 * R[0] + 0.504 * G[0] + 0.098 * B[0] + 16); 164 if (gx*2 + 1 < w) { 165 R[1] = srcImage[si+4+%s]; 166 G[1] = srcImage[si+4+%s]; 167 B[1] = srcImage[si+4+%s]; 168 dstY[i+1] = __float2int_rn(0.257 * R[1] + 0.504 * G[1] + 0.098 * B[1] + 16); 169 } 170 171 //write 1 U and 1 V pixel: 172 float sumu = 0; 173 float sumv = 0; 174 for (j=0; j<2; j++) { 175 sumu += -0.148 * R[j] - 0.291 * G[j] + 0.439 * B[j] + 128; 176 sumv += 0.439 * R[j] - 0.368 * G[j] - 0.071 * B[j] + 128; 177 } 178 U[(gy * strideU) + gx] = __float2int_rn( sumu / 2.0); 179 V[(gy * strideV) + gx] = __float2int_rn( sumv / 2.0); 180 } 181 """ % args 182 183 184 def gen_rgb_to_yuv420p_kernel(rgb_format): 185 R = rgb_format.find("R") 186 G = rgb_format.find("G") 187 B = rgb_format.find("B") 188 assert R>=0 and G>=0 and B>=0, "invalid format: %s" % rgb_format 189 190 kernel_name = "%s_to_YUV420P" % rgb_only_name(rgb_format) 191 args = [kernel_name] + [R, G, B] * 4; 192 return kernel_name, """ 193 #include <stdint.h> 194 195 __global__ void %s(uint8_t *srcImage, int srcPitch, 196 uint8_t *Y, int strideY, 197 uint8_t *U, int strideU, 198 uint8_t *V, int strideV, 199 int w, int h) 200 { 201 uint32_t gx, gy; 202 gx = blockIdx.x * blockDim.x + threadIdx.x; 203 gy = blockIdx.y * blockDim.y + threadIdx.y; 204 205 if ((gx*2 < w) & (gy*2 < h)) { 206 //4 bytes per pixel, reading up to 4 pixels at a time (2 in width and 2 in height): 207 uint32_t si = (gy * 2 * srcPitch) + gx * 4 * 2; 208 209 uint8_t R[4]; 210 uint8_t G[4]; 211 uint8_t B[4]; 212 uint8_t j = 0; 213 214 R[0] = srcImage[si+%s]; 215 G[0] = srcImage[si+%s]; 216 B[0] = srcImage[si+%s]; 217 for (j=0; j<4; j++) { 218 R[j] = R[0]; 219 G[j] = G[0]; 220 B[j] = B[0]; 221 } 222 223 //write up to 4 Y pixels: 224 uint i = gy*2*strideY + gx*2; 225 dstY[i] = __float2int_rn(0.257 * R[0] + 0.504 * G[0] + 0.098 * B[0] + 16); 226 if (gx*2 + 1 < w) { 227 R[1] = srcImage[si+4+%s]; 228 G[1] = srcImage[si+4+%s]; 229 B[1] = srcImage[si+4+%s]; 230 dstY[i+1] = __float2int_rn(0.257 * R[1] + 0.504 * G[1] + 0.098 * B[1] + 16); 231 } 232 if (gy*2 + 1 < h) { 233 i += strideY; 234 si += srcPitch; 235 R[2] = srcImage[si+%s]; 236 G[2] = srcImage[si+%s]; 237 B[2] = srcImage[si+%s]; 238 dstY[i] = __float2int_rn(0.257 * R[2] + 0.504 * G[2] + 0.098 * B[2] + 16); 239 if (gx*2 + 1 < w) { 240 R[3] = srcImage[si+4+%s]; 241 G[3] = srcImage[si+4+%s]; 242 B[3] = srcImage[si+4+%s]; 243 dstY[i+1] = __float2int_rn(0.257 * R[3] + 0.504 * G[3] + 0.098 * B[3] + 16); 244 } 245 } 246 247 //write 1 U and 1 V pixel: 248 float sumu = 0; 249 float sumv = 0; 250 for (j=0; j<4; j++) { 251 sumu += -0.148 * R[j] - 0.291 * G[j] + 0.439 * B[j] + 128; 252 sumv += 0.439 * R[j] - 0.368 * G[j] - 0.071 * B[j] + 128; 253 } 254 U[(gy * strideU) + gx] = __float2int_rn( sumu / 4.0); 255 V[(gy * strideV) + gx] = __float2int_rn( sumv / 4.0); 256 } 257 """ % args 258 259 260 261 RGB_to_YUV_generators = { 262 "YUV444P" : gen_rgb_to_yuv444p_kernel, 263 "YUV422P" : gen_rgb_to_yuv422p_kernel, 264 "YUV420P" : gen_rgb_to_yuv420p_kernel, 265 } 266 267 def gen_rgb_to_yuv_kernels(rgb_mode="RGBX", yuv_modes=YUV_FORMATS): 268 RGB_to_YUV_KERNELS = {} 269 for yuv in yuv_modes: 270 gen = RGB_to_YUV_generators.get(yuv) 271 assert gen is not None, "no generator found for yuv mode %s" % yuv 272 RGB_to_YUV_KERNELS[(rgb_mode, yuv)] = gen(rgb_mode) 273 return RGB_to_YUV_KERNELS -
xpra/codecs/csc_nvcuda/colorspace_converter.py
3 3 # Xpra is released under the terms of the GNU GPL v2, or, at your option, any 4 4 # later version. See the file COPYING for details. 5 5 6 6 from xpra.codecs.csc_nvcuda.CUDA_kernels import gen_rgb_to_yuv_kernels 7 7 from xpra.codecs.image_wrapper import ImageWrapper 8 8 from xpra.codecs.codec_constants import codec_spec, get_subsampling_divs 9 9 from xpra.log import Logger, debug_if_env … … 11 11 debug = debug_if_env(log, "XPRA_CUDA_DEBUG") 12 12 error = log.error 13 13 14 import os 14 15 import numpy 15 16 import time 16 import ctypes17 import sys18 17 assert bytearray 19 import pycuda #@UnresolvedImport 20 from pycuda import driver #@UnresolvedImport 18 import pycuda #@UnresolvedImport 19 from pycuda import driver #@UnresolvedImport 20 from pycuda.compiler import SourceModule #@UnresolvedImport 21 21 driver.init() 22 22 23 23 def log_sys_info(): … … 27 27 def device_info(d): 28 28 return "%s @ %s" % (d.name(), d.pci_bus_id()) 29 29 30 DEFAULT_CUDA_DEVICE_ID = int(os.environ.get("XPRA_CUDA_DEVICE", "0")) 31 30 32 def select_device(): 31 33 ngpus = driver.Device.count() 32 34 log.info("PyCUDA found %s devices:", ngpus) … … 41 43 #debug("CAN_MAP_HOST_MEMORY=%s", host_mem) 42 44 #attr = d.get_attributes() 43 45 #debug("compute_capability=%s, attributes=%s", d.compute_capability(), attr) 44 if host_mem and device is None:46 if host_mem and (device is None or i==DEFAULT_CUDA_DEVICE_ID): 45 47 device = d 46 48 return device 47 49 assert select_device() is not None … … 76 78 context_wrapper = CudaContextWrapper(context) 77 79 context.pop() 78 80 79 def find_lib(basename):80 try:81 if sys.platform == "win32":82 libname = basename+".dll"83 else:84 libname = basename+".so"85 return ctypes.cdll.LoadLibrary(libname)86 except Exception, e:87 debug("could not find %s: %s", basename, e)88 return None89 90 _NPP_LIBRARY_NAMES = ["libnppi", #CUDA5.591 "libnpp"] #CUDA5.092 _NPP_LIBRARIES = []93 for name in _NPP_LIBRARY_NAMES:94 lib = find_lib(name)95 if lib:96 _NPP_LIBRARIES.append(lib)97 if len(_NPP_LIBRARIES)==0:98 raise ImportError("failed to load npp library - check your library path")99 100 #try to get the npp version:101 class NppLibraryVersion(ctypes.Structure):102 _fields_ = [("major", ctypes.c_int),103 ("minor", ctypes.c_int),104 ("build", ctypes.c_int)]105 try:106 nppGetLibVersion = None107 for lib in _NPP_LIBRARIES:108 if hasattr(lib, "nppGetLibVersion"):109 nppGetLibVersion = getattr(lib, "nppGetLibVersion")110 if nppGetLibVersion:111 nppGetLibVersion.argtypes = []112 nppGetLibVersion.restype = ctypes.POINTER(NppLibraryVersion)113 v = nppGetLibVersion().contents114 log.info("found npp library version %s.%s.%s", v.major, v.minor, v.build)115 except:116 log.warn("error getting npp version", exc_info=True)117 118 119 class NppiSize(ctypes.Structure):120 _fields_ = [("width", ctypes.c_int),121 ("height", ctypes.c_int)]122 123 def Npp8u_p(buf):124 return ctypes.cast(int(buf), ctypes.c_void_p)125 126 RGB_to_YUV444P_argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, ctypes.c_int, NppiSize]127 RGB_to_YUV42xP_argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, (ctypes.c_int)*3, NppiSize]128 129 YUV444P_to_RGB_argtypes = [(ctypes.c_void_p)*3, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, NppiSize]130 YUV42xP_to_RGB_argtypes = [(ctypes.c_void_p)*3, (ctypes.c_int)*3, ctypes.c_void_p, ctypes.c_int, NppiSize]131 CONSTANT_ALPHA = ctypes.c_uint8132 133 134 COLORSPACES_MAP_STR = {135 #("RGBX", "YUV444P") : ("nppiRGBToYCbCr_8u_C3P3R", RGB_to_YUV444P_argtypes),136 ("RGBA", "YUV444P") : ("nppiRGBToYCbCr_8u_AC4P3R", RGB_to_YUV444P_argtypes),137 #("YUV444P", "RGB") : ("nppiYCbCrToRGB_8u_P3C3R", YUV444P_to_RGB_argtypes),138 #("YUV444P", "BGR") : ("nppiYCbCrToBGR_8u_P3C3R", YUV444P_to_RGB_argtypes),139 ("YUV444P", "RGBX") : ("nppiYCbCrToRGB_8u_P3C4R", YUV444P_to_RGB_argtypes+[CONSTANT_ALPHA]),140 ("YUV444P", "BGRX") : ("nppiYCbCrToBGR_8u_P3C4R", YUV444P_to_RGB_argtypes+[CONSTANT_ALPHA]),141 #BGR / BGRA: need nppiSwap(Channels before one of the above142 #("RGBX", "YUV422P") : ("nppiRGBToYCbCr422_8u_C3P3R", RGB_to_YUV42xP_argtypes),143 #("BGRX", "YUV422P") : ("nppiBGRToYCbCr422_8u_C3P3R", RGB_to_YUV42xP_argtypes),144 ("BGRX", "YUV422P") : ("nppiBGRToYCbCr422_8u_AC4P3R", RGB_to_YUV42xP_argtypes),145 #("YUV422P", "RGB") : ("nppiYCbCr422ToRGB_8u_P3C3R", YUV42xP_to_RGB_argtypes),146 #("YUV422P", "BGR") : ("nppiYCbCr422ToBGR_8u_P3C3R", YUV42xP_to_RGB_argtypes),147 #YUV420P:148 #("RGBX", "YUV420P") : ("nppiRGBToYCbCr420_8u_C3P3R", RGB_to_YUV42xP_argtypes),149 #("BGRX", "YUV420P") : ("nppiBGRToYCbCr420_8u_C3P3R", RGB_to_YUV42xP_argtypes),150 ("RGBX", "YUV420P") : ("nppiRGBToYCrCb420_8u_AC4P3R", RGB_to_YUV42xP_argtypes),151 ("BGRX", "YUV420P") : ("nppiBGRToYCbCr420_8u_AC4P3R", RGB_to_YUV42xP_argtypes),152 #("YUV420P", "RGB") : ("nppiYCbCr420ToRGB_8u_P3C3R", YUV42xP_to_RGB_argtypes),153 #("YUV420P", "BGR") : ("nppiYCbCr420ToBGR_8u_P3C3R", YUV42xP_to_RGB_argtypes),154 #("YUV420P", "RGBX") : ("nppiYCrCb420ToRGB_8u_P3C4R", YUV42xP_to_RGB_argtypes),155 #("YUV420P", "BGRX") : ("nppiYCbCr420ToBGR_8u_P3C4R", YUV42xP_to_RGB_argtypes),156 }157 #ie:158 #BGR to YUV420P:159 #NppStatus nppiBGRToYCbCr420_8u_C3P3R (const Npp8u *pSrc, int nSrcStep, Npp8u *pDst[3], int rDstStep[3], NppiSize oSizeROI)160 #pSrc Source-Image Pointer.161 #nSrcStep Source-Image Line Step.162 #pDst Destination-Planar-Image Pointer Array.163 #rDstStep Destination-Planar-Image Line Step Array.164 #oSizeROI Region-of-Interest (ROI). (struct with width and height)165 #Returns:166 #Image Data Related Error Codes, ROI Related Error Codes167 168 #For YUV444P:169 #NppStatus nppiRGBToYCbCr_8u_C3P3R(const Npp8u * pSrc, int nSrcStep, Npp8u * pDst[3], int nDstStep, NppiSize oSizeROI);170 #(only one nDstStep!171 172 #YUV420P to RGB:173 #NppStatus nppiYCbCrToRGB_8u_P3C3R(const Npp8u * const pSrc[3], int nSrcStep, Npp8u * pDst, int nDstStep, NppiSize oSizeROI);174 #YUV444P to RGB:175 #NppStatus nppiYCrCb420ToRGB_8u_P3C4R(const Npp8u * const pSrc[3],int rSrcStep[3], Npp8u * pDst, int nDstStep, NppiSize oSizeROI);176 #Those with alpha add:177 #Npp8u nAval178 179 180 NPP_NO_OPERATION_WARNING = 1181 NPP_DIVIDE_BY_ZERO_WARNING = 6182 NPP_AFFINE_QUAD_INCORRECT_WARNING = 28183 NPP_WRONG_INTERSECTION_ROI_WARNING = 29184 NPP_WRONG_INTERSECTION_QUAD_WARNING = 30185 NPP_DOUBLE_SIZE_WARNING = 35186 NPP_MISALIGNED_DST_ROI_WARNING = 10000187 188 WARNINGS = {189 NPP_NO_OPERATION_WARNING : "Indicates that no operation was performed",190 NPP_DIVIDE_BY_ZERO_WARNING: "Divisor is zero however does not terminate the execution",191 NPP_AFFINE_QUAD_INCORRECT_WARNING: "Indicates that the quadrangle passed to one of affine warping functions doesn't have necessary properties. First 3 vertices are used, the fourth vertex discarded",192 NPP_WRONG_INTERSECTION_ROI_WARNING: "The given ROI has no interestion with either the source or destination ROI. Thus no operation was performed",193 NPP_WRONG_INTERSECTION_QUAD_WARNING:"The given quadrangle has no intersection with either the source or destination ROI. Thus no operation was performed",194 NPP_DOUBLE_SIZE_WARNING: "Image size isn't multiple of two. Indicates that in case of 422/411/420 sampling the ROI width/height was modified for proper processing",195 NPP_MISALIGNED_DST_ROI_WARNING: "Speed reduction due to uncoalesced memory accesses warning"196 }197 NPP_STEP_ERROR = -14198 NPP_NOT_EVEN_STEP_ERROR = -108199 200 ERRORS = {201 NPP_STEP_ERROR : "Step is less or equal zero",202 NPP_NOT_EVEN_STEP_ERROR : "Step value is not pixel multiple",203 }204 205 206 YUV_INDEX_TO_PLANE = {207 0 : "Y",208 1 : "U",209 2 : "V"210 }211 212 213 81 def roundup(n, m): 214 82 return (n + m - 1) & ~(m - 1) 215 83 216 84 217 COLORSPACES_MAP = {} 218 for k, f_def in COLORSPACES_MAP_STR.items(): 219 fn, argtypes = f_def 220 try: 221 for lib in _NPP_LIBRARIES: 222 if hasattr(lib, fn): 223 cfn = getattr(lib, fn) 224 debug("found %s for %s in %s: %s", fn, k, lib, cfn) 225 COLORSPACES_MAP[k] = (fn, cfn) 226 #set argument types and return type: 227 cfn.restype = ctypes.c_int 228 cfn.argtypes = argtypes 229 except: 230 log.error("could not load '%s', conversion disabled: %s", fn, k) 85 COLORSPACES_MAP = { 86 "BGRA" : ("YUV420P", "YUV422P", "YUV444P"), 87 "BGRX" : ("YUV420P", "YUV422P", "YUV444P"), 88 "RGBA" : ("YUV420P", "YUV422P", "YUV444P"), 89 "RGBX" : ("YUV420P", "YUV422P", "YUV444P"), 90 } 91 KERNELS_MAP = {} 92 for rgb_format, yuv_formats in COLORSPACES_MAP.items(): 93 m = gen_rgb_to_yuv_kernels(rgb_format, yuv_formats) 94 KERNELS_MAP.update(m) 95 log.info("csc_nvcuda kernels: %s", KERNELS_MAP) 231 96 232 233 97 def get_type(): 234 98 return "nvcuda" 235 99 … … 237 101 return pycuda.VERSION_TEXT 238 102 239 103 def get_input_colorspaces(): 240 return sorted( set([src for src, _ in COLORSPACES_MAP.keys()]))104 return sorted(COLORSPACES_MAP.keys()) 241 105 242 106 def get_output_colorspaces(input_colorspace): 243 return sorted( set(dst for src,dst in COLORSPACES_MAP.keys() if src==input_colorspace))107 return sorted(COLORSPACES_MAP.get(input_colorspace)) 244 108 245 109 def validate_in_out(in_colorspace, out_colorspace): 246 110 assert in_colorspace in get_input_colorspaces(), "invalid input colorspace: %s (must be one of %s)" % (in_colorspace, get_input_colorspaces()) … … 262 126 self.dst_format = "" 263 127 self.time = 0 264 128 self.frames = 0 129 self.cuda_device = None 130 self.cuda_context = None 131 self.max_block_sizes = 0 132 self.max_grid_sizes = 0 133 self.max_threads_per_block = 0 265 134 self.kernel_function = None 135 self.kernel_function_name = None 266 136 self.context = None 267 137 268 138 def init_context(self, src_width, src_height, src_format, … … 276 146 self.dst_height = dst_height 277 147 self.dst_format = dst_format 278 148 self.context = context 279 k = (src_format, dst_format) 280 npp_fn = COLORSPACES_MAP.get(k) 281 assert npp_fn is not None, "invalid pair: %s" % k 282 self.kernel_function_name, cfn = npp_fn 283 debug("init_context%s npp conversion function=%s (%s)", (src_width, src_height, src_format, dst_width, dst_height, dst_format), self.kernel_function_name, cfn) 284 self.kernel_function = cfn 285 if src_format.find("YUV")>=0: 286 self.convert_image_fn = self.convert_image_yuv 287 else: 288 self.convert_image_fn = self.convert_image_rgb 149 assert self.src_width==self.dst_width and self.src_height==self.dst_height, "scaling is not supported! (%sx%s to %sx%s)" % (self.src_width, self.src_height, self.dst_width, self.dst_height) 150 151 self.init_cuda(0) 152 153 def init_cuda(self, device_id): 154 debug("init_cuda(%s)", device_id) 155 self.cuda_device = driver.Device(DEFAULT_CUDA_DEVICE_ID) 156 self.cuda_context = self.cuda_device.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST) 157 #use alias to make code easier to read: 158 d = self.cuda_device 159 da = driver.device_attribute 160 try: 161 debug("init_cuda(%s) cuda_device=%s, cuda_context=%s", device_id, self.cuda_device, self.cuda_context) 162 #compile/get kernel: 163 key = self.src_format, self.dst_format 164 k = KERNELS_MAP.get(key) 165 assert k is not None, "kernel not found for %s" % str(key) 166 self.kernel_function_name, ksrc = k 167 debug("init_cuda(%s) found kernel %s: %s", self.kernel_function_name, ksrc) 168 mod = SourceModule(ksrc) 169 self.kernel_function = mod.get_function(self.kernel_function_name) 170 171 self.max_block_sizes = d.get_attribute(da.MAX_BLOCK_DIM_X), d.get_attribute(da.MAX_BLOCK_DIM_Y), d.get_attribute(da.MAX_BLOCK_DIM_Z) 172 self.max_grid_sizes = d.get_attribute(da.MAX_GRID_DIM_X), d.get_attribute(da.MAX_GRID_DIM_Y), d.get_attribute(da.MAX_GRID_DIM_Z) 173 debug("max_block_sizes=%s", self.max_block_sizes) 174 debug("max_grid_sizes=%s", self.max_grid_sizes) 175 176 self.max_threads_per_block = self.BGRA2NV12.get_attribute(driver.function_attribute.MAX_THREADS_PER_BLOCK) 177 debug("max_threads_per_block=%s", self.max_threads_per_block) 178 179 self.init_nvenc() 180 finally: 181 self.cuda_context.pop() 182 183 self.convert_image_fn = self.convert_image_rgb 289 184 debug("init_context(..) convert_image=%s", self.convert_image) 290 185 291 186 def get_info(self): … … 348 243 finally: 349 244 self.context.pop() 350 245 351 def convert_image_yuv(self, image):352 global program353 start = time.time()354 iplanes = image.get_planes()355 width = image.get_width()356 height = image.get_height()357 strides = image.get_rowstride()358 pixels = image.get_pixels()359 debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels))360 assert iplanes==ImageWrapper._3_PLANES, "must use planar YUV as input"361 assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)362 assert len(strides)==len(pixels)==3, "invalid number of planes (%s) or strides (%s), should be 3" % (len(strides), len(pixels))363 364 #YUV444P argtypes = [(ctypes.c_void_p)*3, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, NppiSize]365 #YUV42xP argtypes = [(ctypes.c_void_p)*3, (ctypes.c_int)*3, ctypes.c_void_p, ctypes.c_int, NppiSize]366 in_t = self.kernel_function.argtypes[0] #always: (ctypes.c_void_p)*3367 in_strides_t = self.kernel_function.argtypes[1] #(ctypes.c_int)*3 OR ctypes.c_int368 369 divs = get_subsampling_divs(self.src_format)370 371 stream = driver.Stream()372 #copy each plane to the GPU:373 upload_start = time.time()374 locked_mem = [] #reference to pinned memory375 in_bufs = [] #GPU side yuv channels376 in_strides = [] #GPU side strides377 for i in range(3):378 x_div, y_div = divs[i]379 stride = strides[i]380 assert stride >= width/x_div, \381 "invalid stride %s is smaller than plane %s width %s/%s" % (stride, YUV_INDEX_TO_PLANE.get(i, i), width, x_div)382 in_height = height/y_div383 plane = pixels[i]384 assert len(plane)>=stride*in_height385 386 mem = numpy.frombuffer(plane, dtype=numpy.byte)387 if True:388 #keeping stride as it is:389 in_buf = driver.mem_alloc(len(plane))390 in_bufs.append(in_buf)391 in_strides.append(stride)392 hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP)393 pycuda.driver.memcpy_htod_async(in_buf, mem, stream)394 else:395 #change stride to what we get from mem_alloc_pitch:396 in_buf, in_stride = driver.mem_alloc_pitch(stride, in_height, 4)397 in_bufs.append(in_buf)398 in_strides.append(in_stride)399 hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP)400 locked_mem.append(hmem)401 copy = driver.Memcpy2D()402 copy.set_src_host(hmem)403 copy.set_dst_device(in_buf)404 copy.src_pitch = stride405 copy.dst_pitch = in_stride406 copy.width_in_bytes = stride407 copy.height = in_height408 copy(stream)409 stream.synchronize()410 #all the copying is complete, we can unpin the host memory:411 for hmem in locked_mem:412 hmem.base.unregister()413 upload_end = time.time()414 debug("%s pixels now on GPU at %s, took %.1fms", sum([len(plane) for plane in pixels]), in_bufs, upload_end-upload_start)415 416 #allocate output RGB buffer on CPU:417 out_buf, out_stride = driver.mem_alloc_pitch(width*4, height, 4)418 src = in_t(*[Npp8u_p(in_buf) for in_buf in in_bufs])419 if in_strides_t==ctypes.c_int:420 #one stride for all planes (this must be YUV444P)421 assert len(set(in_strides))==1, "expected only one stride: %s" % str(in_strides)422 in_strides = [in_strides[0]]423 debug("in_strides=%s, out_stride=%s", in_strides, out_stride)424 kargs = [src, in_strides_t(*in_strides), Npp8u_p(out_buf), ctypes.c_int(out_stride), NppiSize(width, height)]425 if self.kernel_function.argtypes[-1]==CONSTANT_ALPHA:426 #add hardcoded constant alpha:427 kargs.append(ctypes.c_uint8(255))428 debug("calling %s%s", self.kernel_function_name, tuple(kargs))429 kstart = time.time()430 v = self.kernel_function(*kargs)431 #we can now free the GPU source planes:432 for in_buf in in_bufs:433 in_buf.free()434 if v<0:435 log.error("%s%s returned an error: %s", self.kernel_function_name, kargs, ERRORS.get(v, v))436 return None437 elif v>0 and v!=NPP_DOUBLE_SIZE_WARNING:438 #positive return-codes indicate warnings:439 warning = WARNINGS.get(v, "unknown")440 log.warn("%s returned a warning %s: %s", self.kernel_function_name, v, warning)441 kend = time.time()442 debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0)443 444 self.frames += 1445 446 read_start = time.time()447 gpu_size = out_stride*height448 min_size = width*4*height449 if gpu_size<=2*min_size:450 #direct full buffer async copy with GPU padding:451 pixels = driver.pagelocked_empty(stride*height, dtype=numpy.byte)452 driver.memcpy_dtoh_async(pixels, out_buf, stream)453 else:454 #we don't want the crazy large GPU padding, so we do it ourselves:455 stride = width*4456 pixels = driver.pagelocked_empty(stride*height, dtype=numpy.byte)457 copy = driver.Memcpy2D()458 copy.set_src_device(out_buf)459 copy.set_dst_host(pixels)460 copy.src_pitch = out_stride461 copy.dst_pitch = stride462 copy.width_in_bytes = width*4463 copy.height = height464 copy(stream)465 stream.synchronize()466 467 #the pixels have been copied, we can free the GPU output memory:468 out_buf.free()469 self.context.synchronize()470 read_end = time.time()471 debug("read back took %.1fms, total time: %.1f", (read_end-read_start)*1000.0, 1000.0*(time.time()-start))472 return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels.data, self.dst_format, 24, out_stride, planes=ImageWrapper.PACKED)473 474 475 246 def convert_image_rgb(self, image): 476 247 global program 477 248 start = time.time() 478 249 iplanes = image.get_planes() 479 w idth= image.get_width()480 h eight= image.get_height()250 w = image.get_width() 251 h = image.get_height() 481 252 stride = image.get_rowstride() 482 253 pixels = image.get_pixels() 483 254 debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels)) 484 255 assert iplanes==ImageWrapper.PACKED, "must use packed format as input" 485 256 assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format) 486 487 257 divs = get_subsampling_divs(self.dst_format) 488 258 489 259 #copy packed rgb pixels to GPU: 490 260 upload_start = time.time() 491 261 stream = driver.Stream() 492 262 mem = numpy.frombuffer(pixels, dtype=numpy.byte) 493 #keeping stride as it is:494 #the non async/pinned version is simple but slower:495 # gpu_image = driver.to_device(pixels)496 #followed by:497 # gpu_image.free()498 263 in_buf = driver.mem_alloc(len(pixels)) 499 264 hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP) 500 265 pycuda.driver.memcpy_htod_async(in_buf, mem, stream) 501 266 502 #YUV444P argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, ctypes.c_int, NppiSize]503 #YUV42xP argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, (ctypes.c_int)*3, NppiSize]504 out_t = self.kernel_function.argtypes[2] #always: (ctypes.c_void_p)*3505 out_strides_t = self.kernel_function.argtypes[3] #(ctypes.c_int)*3 OR ctypes.c_int506 267 out_bufs = [] 507 268 out_strides = [] 508 269 out_sizes = [] 509 270 for i in range(3): 510 271 x_div, y_div = divs[i] 511 out_stride = roundup( width/x_div, 4)512 out_height = roundup( height/y_div, 2)272 out_stride = roundup(self.dst_width/x_div, 4) 273 out_height = roundup(self.dst_height/y_div, 2) 513 274 out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4) 514 275 out_bufs.append(out_buf) 515 276 out_strides.append(out_stride) 516 277 out_sizes.append((out_stride, out_height)) 517 dest = out_t(*[ctypes.cast(int(out_buf), ctypes.c_void_p) for out_buf in out_bufs]) 518 if out_strides_t==ctypes.c_int: 519 #one stride for all planes (this must be YUV444P) 520 assert len(set(out_strides))==1, "more than one stride where only one expected in: %s" % out_strides 521 out_strides = [out_strides[0]] 522 kargs = [Npp8u_p(in_buf), ctypes.c_int(stride), dest, out_strides_t(*out_strides), NppiSize(width, height)] 523 #ensure copying has finished: 278 #ensure uploading has finished: 524 279 stream.synchronize() 525 280 #we can now unpin the host memory: 526 281 hmem.base.unregister() 527 debug("allocation took %.1fms", 1000.0*(time.time() - upload_start))282 debug("allocation and upload took %.1fms", 1000.0*(time.time() - upload_start)) 528 283 529 debug("calling %s%s", self.kernel_function_name, tuple(kargs))530 284 kstart = time.time() 531 v = self.kernel_function(*kargs) 285 kargs = [in_buf, stride] 286 for i in range(3): 287 kargs.append(out_bufs[i]) 288 kargs.append(numpy.int32(out_strides[i])) 289 blockw, blockh = 16, 16 290 #divide each dimension by 2 since we process 4 pixels at a time: 291 gridw = max(1, w/blockw/2) 292 if gridw*2*blockw<w: 293 gridw += 1 294 gridh = max(1, h/blockh/2) 295 if gridh*2*blockh<h: 296 gridh += 1 297 debug("calling %s%s, with grid=%s, block=%s", self.kernel_function_name, tuple(kargs), (gridw, gridh), (blockw, blockh)) 298 self.kernel_function(*kargs, 299 block=(blockw,blockh,1), grid=(gridw, gridh)) 300 532 301 #we can now free the GPU source buffer: 533 302 in_buf.free() 534 if v<0:535 log.error("%s%s returned an error: %s", self.kernel_function_name, kargs, ERRORS.get(v, v))536 return None537 elif v>0 and v!=NPP_DOUBLE_SIZE_WARNING:538 #positive return-codes indicate warnings:539 warning = WARNINGS.get(v, "unknown")540 log.warn("%s returned a warning %s: %s", self.kernel_function_name, v, warning)541 303 kend = time.time() 542 304 debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0) 543 305 self.frames += 1