xpra icon
Bug tracker and wiki

This bug tracker and wiki are being discontinued
please use https://github.com/Xpra-org/xpra instead.


Ticket #384: csc_nvcuda-withcustomkernel.patch

File csc_nvcuda-withcustomkernel.patch, 33.4 KB (added by Antoine Martin, 9 years ago)

use code similar to #370 (custom kernels) instead of the useless nvidia npp

  • xpra/codecs/csc_nvcuda/CUDA_kernels.py

     
     1# This file is part of Xpra.
     2# Copyright (C) 2013 Antoine Martin <antoine@devloop.org.uk>
     3# Xpra is released under the terms of the GNU GPL v2, or, at your option, any
     4# later version. See the file COPYING for details.
     5
     6YUV_FORMATS = ("YUV444P", "YUV422P", "YUV420P")
     7
     8def rgb_only_name(rgb_format):
     9    #strip out X and A:
     10    n = rgb_format.replace("A", "").replace("X", "").upper()
     11    for x in n:
     12        assert x in ("R", "G", "B"), "invalid character found in rgb format: '%s' in %s" % (x, rgb_format)
     13    return n
     14
     15def gen_rgb_to_nv12_kernel(rgb_format):
     16    R = rgb_format.find("R")
     17    G = rgb_format.find("G")
     18    B = rgb_format.find("B")
     19    assert R>=0 and G>=0 and B>=0, "invalid format: %s" % rgb_format
     20
     21    kernel_name = "%s_to_NV12" % rgb_only_name(rgb_format)
     22    args = [kernel_name] + [R, G, B] * 4; 
     23    return kernel_name, """
     24#include <stdint.h>
     25
     26__global__ void %s(uint8_t *srcImage,    int srcPitch,
     27                   uint8_t *dstImage,    int dstPitch, int dstHeight,
     28                   int w,                int h)
     29{
     30    uint32_t gx, gy;
     31    gx = blockIdx.x * blockDim.x + threadIdx.x;
     32    gy = blockIdx.y * blockDim.y + threadIdx.y;
     33
     34    if ((gx*2 < w) & (gy*2 < h)) {
     35        //4 bytes per pixel, and 2 pixels width/height at a time:
     36        uint32_t si = (gy * 2 * srcPitch) + gx * 2 * 4;
     37
     38        //we may read up to 4 32-bit RGB pixels:
     39        uint8_t R[4];
     40        uint8_t G[4];
     41        uint8_t B[4];
     42        uint8_t j = 0;
     43        R[0] = srcImage[si+%s];
     44        G[0] = srcImage[si+%s];
     45        B[0] = srcImage[si+%s];
     46        for (j=0; j<4; j++) {
     47            R[j] = R[0];
     48            G[j] = G[0];
     49            B[j] = B[0];
     50        }
     51
     52        //write up to 4 Y pixels:
     53        uint32_t di = (gy * 2 * dstPitch) + gx * 2;
     54        dstImage[di] = __float2int_rn(0.257 * R[0] + 0.504 * G[0] + 0.098 * B[0] + 16);
     55        if (gx*2 + 1 < w) {
     56            R[1] = srcImage[si+%s];
     57            G[1] = srcImage[si+%s];
     58            B[1] = srcImage[si+%s];
     59            dstImage[di + 1] = __float2int_rn(0.257 * R[1] + 0.504 * G[1] + 0.098 * B[1] + 16);
     60        }
     61        if (gy*2 + 1 < h) {
     62            si += srcPitch;
     63            di += dstPitch;
     64            R[2] = srcImage[si+2];
     65            G[2] = srcImage[si+1];
     66            B[2] = srcImage[si];
     67            dstImage[di] = __float2int_rn(0.257 * R[2] + 0.504 * G[2] + 0.098 * B[2] + 16);
     68            if (gx*2 + 1 < w) {
     69                R[3] = srcImage[si+%s];
     70                G[3] = srcImage[si+%s];
     71                B[3] = srcImage[si+%s];
     72                dstImage[di + 1] = __float2int_rn(0.257 * R[3] + 0.504 * G[3] + 0.098 * B[3] + 16);
     73            }
     74        }
     75
     76        //write 1 U and 1 V pixel:
     77        float_t u = 0;
     78        float_t v = 0;
     79        for (j=0; j<4; j++) {
     80            u += -0.148 * R[j] - 0.291 * G[j] + 0.439 * B[j] + 128;
     81            v +=  0.439 * R[j] - 0.368 * G[j] - 0.071 * B[j] + 128;
     82        }
     83        di = (dstHeight + gy) * dstPitch + gx * 2;
     84        dstImage[di]      = __float2int_rn(u / 4.0);
     85        dstImage[di + 1]  = __float2int_rn(v / 4.0);
     86    }
     87}
     88    """ % args
     89
     90def gen_rgb_to_yuv444p_kernel(rgb_format):
     91    R = rgb_format.find("R")
     92    G = rgb_format.find("G")
     93    B = rgb_format.find("B")
     94    assert R>=0 and G>=0 and B>=0, "invalid format: %s" % rgb_format
     95
     96    kernel_name = "%s_to_YUV444P" % rgb_only_name(rgb_format)
     97    args = [kernel_name] + [R, G, B];
     98    return kernel_name, """
     99#include <stdint.h>
     100
     101__global__ void %s(uint8_t *srcImage,     int srcPitch,
     102                   uint8_t *Y,            int strideY,
     103                   uint8_t *U,            int strideU,
     104                   uint8_t *V,            int strideV,
     105                   int w,                 int h)
     106{
     107    uint32_t gx, gy;
     108    gx = blockIdx.x * blockDim.x + threadIdx.x;
     109    gy = blockIdx.y * blockDim.y + threadIdx.y;
     110
     111    if ((gx < w) & (gy < h)) {
     112        //4 bytes per pixel:
     113        uint32_t si = (gy * srcPitch) + gx * 4;
     114        uint8_t R = srcImage[si+%s];
     115        uint8_t G = srcImage[si+%s];
     116        uint8_t B = srcImage[si+%s];
     117
     118        Y[(gy * strideY) + gx] = __float2int_rn( 0.257 * R + 0.504 * G + 0.098 * B + 16);
     119        U[(gy * strideU) + gx] = __float2int_rn(-0.148 * R - 0.291 * G + 0.439 * B + 128);
     120        V[(gy * strideV) + gx] = __float2int_rn( 0.439 * R - 0.368 * G - 0.071 * B + 128);
     121}
     122    """ % args
     123
     124def gen_rgb_to_yuv422p_kernel(rgb_format):
     125    R = rgb_format.find("R")
     126    G = rgb_format.find("G")
     127    B = rgb_format.find("B")
     128    assert R>=0 and G>=0 and B>=0, "invalid format: %s" % rgb_format
     129
     130    kernel_name = "%s_to_YUV422P" % rgb_only_name(rgb_format)
     131    args = [kernel_name] + [R, G, B] * 2;
     132    return kernel_name, """
     133#include <stdint.h>
     134
     135__global__ void %s(uint8_t *srcImage,     int srcPitch,
     136                   uint8_t *Y,            int strideY,
     137                   uint8_t *U,            int strideU,
     138                   uint8_t *V,            int strideV,
     139                   int w,                 int h)
     140{
     141    uint32_t gx, gy;
     142    gx = blockIdx.x * blockDim.x + threadIdx.x;
     143    gy = blockIdx.y * blockDim.y + threadIdx.y;
     144
     145    if ((gx*2 < w) & (gy < h)) {
     146        //4 bytes per pixel, reading up to 2 pixels at a time:
     147        uint32_t si = (gy * srcPitch) + gx * 4 * 2;
     148
     149        uint8_t R[2];
     150        uint8_t G[2];
     151        uint8_t B[2];
     152        uint8_t j = 0;
     153
     154        R[0] = srcImage[si+%s];
     155        G[0] = srcImage[si+%s];
     156        B[0] = srcImage[si+%s];
     157        R[1] = R[0];
     158        G[1] = G[0];
     159        B[1] = B[0];
     160
     161        //write up to 2 Y pixels:
     162        uint i = gy*strideY + gx*2;
     163        dstY[i] = __float2int_rn(0.257 * R[0] + 0.504 * G[0] + 0.098 * B[0] + 16);
     164        if (gx*2 + 1 < w) {
     165            R[1] = srcImage[si+4+%s];
     166            G[1] = srcImage[si+4+%s];
     167            B[1] = srcImage[si+4+%s];
     168            dstY[i+1] = __float2int_rn(0.257 * R[1] + 0.504 * G[1] + 0.098 * B[1] + 16);
     169        }
     170
     171        //write 1 U and 1 V pixel:
     172        float sumu = 0;
     173        float sumv = 0;
     174        for (j=0; j<2; j++) {
     175            sumu += -0.148 * R[j] - 0.291 * G[j] + 0.439 * B[j] + 128;
     176            sumv +=  0.439 * R[j] - 0.368 * G[j] - 0.071 * B[j] + 128;
     177        }
     178        U[(gy * strideU) + gx] = __float2int_rn( sumu / 2.0);
     179        V[(gy * strideV) + gx] = __float2int_rn( sumv / 2.0);
     180}
     181    """ % args
     182
     183
     184def gen_rgb_to_yuv420p_kernel(rgb_format):
     185    R = rgb_format.find("R")
     186    G = rgb_format.find("G")
     187    B = rgb_format.find("B")
     188    assert R>=0 and G>=0 and B>=0, "invalid format: %s" % rgb_format
     189
     190    kernel_name = "%s_to_YUV420P" % rgb_only_name(rgb_format)
     191    args = [kernel_name] + [R, G, B] * 4;
     192    return kernel_name, """
     193#include <stdint.h>
     194
     195__global__ void %s(uint8_t *srcImage,     int srcPitch,
     196                   uint8_t *Y,            int strideY,
     197                   uint8_t *U,            int strideU,
     198                   uint8_t *V,            int strideV,
     199                   int w,                 int h)
     200{
     201    uint32_t gx, gy;
     202    gx = blockIdx.x * blockDim.x + threadIdx.x;
     203    gy = blockIdx.y * blockDim.y + threadIdx.y;
     204
     205    if ((gx*2 < w) & (gy*2 < h)) {
     206        //4 bytes per pixel, reading up to 4 pixels at a time (2 in width and 2 in height):
     207        uint32_t si = (gy * 2 * srcPitch) + gx * 4 * 2;
     208
     209        uint8_t R[4];
     210        uint8_t G[4];
     211        uint8_t B[4];
     212        uint8_t j = 0;
     213
     214        R[0] = srcImage[si+%s];
     215        G[0] = srcImage[si+%s];
     216        B[0] = srcImage[si+%s];
     217        for (j=0; j<4; j++) {
     218            R[j] = R[0];
     219            G[j] = G[0];
     220            B[j] = B[0];
     221        }
     222
     223        //write up to 4 Y pixels:
     224        uint i = gy*2*strideY + gx*2;
     225        dstY[i] = __float2int_rn(0.257 * R[0] + 0.504 * G[0] + 0.098 * B[0] + 16);
     226        if (gx*2 + 1 < w) {
     227            R[1] = srcImage[si+4+%s];
     228            G[1] = srcImage[si+4+%s];
     229            B[1] = srcImage[si+4+%s];
     230            dstY[i+1] = __float2int_rn(0.257 * R[1] + 0.504 * G[1] + 0.098 * B[1] + 16);
     231        }
     232        if (gy*2 + 1 < h) {
     233            i += strideY;
     234            si += srcPitch;
     235            R[2] = srcImage[si+%s];
     236            G[2] = srcImage[si+%s];
     237            B[2] = srcImage[si+%s];
     238            dstY[i] = __float2int_rn(0.257 * R[2] + 0.504 * G[2] + 0.098 * B[2] + 16);
     239            if (gx*2 + 1 < w) {
     240                R[3] = srcImage[si+4+%s];
     241                G[3] = srcImage[si+4+%s];
     242                B[3] = srcImage[si+4+%s];
     243                dstY[i+1] = __float2int_rn(0.257 * R[3] + 0.504 * G[3] + 0.098 * B[3] + 16);
     244            }
     245        }
     246
     247        //write 1 U and 1 V pixel:
     248        float sumu = 0;
     249        float sumv = 0;
     250        for (j=0; j<4; j++) {
     251            sumu += -0.148 * R[j] - 0.291 * G[j] + 0.439 * B[j] + 128;
     252            sumv +=  0.439 * R[j] - 0.368 * G[j] - 0.071 * B[j] + 128;
     253        }
     254        U[(gy * strideU) + gx] = __float2int_rn( sumu / 4.0);
     255        V[(gy * strideV) + gx] = __float2int_rn( sumv / 4.0);
     256}
     257    """ % args
     258
     259
     260
     261RGB_to_YUV_generators = {
     262                    "YUV444P"   : gen_rgb_to_yuv444p_kernel,
     263                    "YUV422P"   : gen_rgb_to_yuv422p_kernel,
     264                    "YUV420P"   : gen_rgb_to_yuv420p_kernel,
     265                    }
     266
     267def gen_rgb_to_yuv_kernels(rgb_mode="RGBX", yuv_modes=YUV_FORMATS):
     268    RGB_to_YUV_KERNELS = {}
     269    for yuv in yuv_modes:
     270        gen = RGB_to_YUV_generators.get(yuv)
     271        assert gen is not None, "no generator found for yuv mode %s" % yuv
     272        RGB_to_YUV_KERNELS[(rgb_mode, yuv)] = gen(rgb_mode)
     273    return RGB_to_YUV_KERNELS
  • xpra/codecs/csc_nvcuda/colorspace_converter.py

     
    33# Xpra is released under the terms of the GNU GPL v2, or, at your option, any
    44# later version. See the file COPYING for details.
    55
    6 
     6from xpra.codecs.csc_nvcuda.CUDA_kernels import gen_rgb_to_yuv_kernels
    77from xpra.codecs.image_wrapper import ImageWrapper
    88from xpra.codecs.codec_constants import codec_spec, get_subsampling_divs
    99from xpra.log import Logger, debug_if_env
     
    1111debug = debug_if_env(log, "XPRA_CUDA_DEBUG")
    1212error = log.error
    1313
     14import os
    1415import numpy
    1516import time
    16 import ctypes
    17 import sys
    1817assert bytearray
    19 import pycuda               #@UnresolvedImport
    20 from pycuda import driver   #@UnresolvedImport
     18import pycuda                               #@UnresolvedImport
     19from pycuda import driver                   #@UnresolvedImport
     20from pycuda.compiler import SourceModule    #@UnresolvedImport
    2121driver.init()
    2222
    2323def log_sys_info():
     
    2727def device_info(d):
    2828    return "%s @ %s" % (d.name(), d.pci_bus_id())
    2929
     30DEFAULT_CUDA_DEVICE_ID = int(os.environ.get("XPRA_CUDA_DEVICE", "0"))
     31
    3032def select_device():
    3133    ngpus = driver.Device.count()
    3234    log.info("PyCUDA found %s devices:", ngpus)
     
    4143        #debug("CAN_MAP_HOST_MEMORY=%s", host_mem)
    4244        #attr = d.get_attributes()
    4345        #debug("compute_capability=%s, attributes=%s", d.compute_capability(), attr)
    44         if host_mem and device is None:
     46        if host_mem and (device is None or i==DEFAULT_CUDA_DEVICE_ID):
    4547            device = d
    4648    return device
    4749assert select_device() is not None
     
    7678    context_wrapper = CudaContextWrapper(context)
    7779    context.pop()
    7880
    79 def find_lib(basename):
    80     try:
    81         if sys.platform == "win32":
    82             libname = basename+".dll"
    83         else:
    84             libname = basename+".so"
    85         return ctypes.cdll.LoadLibrary(libname)
    86     except Exception, e:
    87         debug("could not find %s: %s", basename, e)
    88         return None
    89 
    90 _NPP_LIBRARY_NAMES = ["libnppi",    #CUDA5.5
    91                       "libnpp"]     #CUDA5.0
    92 _NPP_LIBRARIES = []
    93 for name in _NPP_LIBRARY_NAMES:
    94     lib = find_lib(name)
    95     if lib:
    96         _NPP_LIBRARIES.append(lib)
    97 if len(_NPP_LIBRARIES)==0:
    98     raise ImportError("failed to load npp library - check your library path")
    99 
    100 #try to get the npp version:
    101 class NppLibraryVersion(ctypes.Structure):
    102     _fields_ = [("major", ctypes.c_int),
    103                 ("minor", ctypes.c_int),
    104                 ("build", ctypes.c_int)]
    105 try:
    106     nppGetLibVersion = None
    107     for lib in _NPP_LIBRARIES:
    108         if hasattr(lib, "nppGetLibVersion"):
    109             nppGetLibVersion = getattr(lib, "nppGetLibVersion")
    110     if nppGetLibVersion:
    111         nppGetLibVersion.argtypes = []
    112         nppGetLibVersion.restype = ctypes.POINTER(NppLibraryVersion)
    113         v = nppGetLibVersion().contents
    114         log.info("found npp library version %s.%s.%s", v.major, v.minor, v.build)
    115 except:
    116     log.warn("error getting npp version", exc_info=True)
    117 
    118 
    119 class NppiSize(ctypes.Structure):
    120     _fields_ = [("width", ctypes.c_int),
    121                 ("height", ctypes.c_int)]
    122 
    123 def Npp8u_p(buf):
    124     return ctypes.cast(int(buf), ctypes.c_void_p)
    125 
    126 RGB_to_YUV444P_argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, ctypes.c_int, NppiSize]
    127 RGB_to_YUV42xP_argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, (ctypes.c_int)*3, NppiSize]
    128 
    129 YUV444P_to_RGB_argtypes = [(ctypes.c_void_p)*3, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, NppiSize]
    130 YUV42xP_to_RGB_argtypes = [(ctypes.c_void_p)*3, (ctypes.c_int)*3, ctypes.c_void_p, ctypes.c_int, NppiSize]
    131 CONSTANT_ALPHA = ctypes.c_uint8
    132 
    133 
    134 COLORSPACES_MAP_STR  = {
    135         #("RGBX",    "YUV444P")  : ("nppiRGBToYCbCr_8u_C3P3R",       RGB_to_YUV444P_argtypes),
    136         ("RGBA",    "YUV444P")  : ("nppiRGBToYCbCr_8u_AC4P3R",      RGB_to_YUV444P_argtypes),
    137         #("YUV444P", "RGB")      : ("nppiYCbCrToRGB_8u_P3C3R",       YUV444P_to_RGB_argtypes),
    138         #("YUV444P", "BGR")      : ("nppiYCbCrToBGR_8u_P3C3R",       YUV444P_to_RGB_argtypes),
    139         ("YUV444P", "RGBX")     : ("nppiYCbCrToRGB_8u_P3C4R",       YUV444P_to_RGB_argtypes+[CONSTANT_ALPHA]),
    140         ("YUV444P", "BGRX")     : ("nppiYCbCrToBGR_8u_P3C4R",       YUV444P_to_RGB_argtypes+[CONSTANT_ALPHA]),
    141         #BGR / BGRA: need nppiSwap(Channels before one of the above
    142         #("RGBX",    "YUV422P")  : ("nppiRGBToYCbCr422_8u_C3P3R",    RGB_to_YUV42xP_argtypes),
    143         #("BGRX",    "YUV422P")  : ("nppiBGRToYCbCr422_8u_C3P3R",    RGB_to_YUV42xP_argtypes),
    144         ("BGRX",    "YUV422P")  : ("nppiBGRToYCbCr422_8u_AC4P3R",   RGB_to_YUV42xP_argtypes),
    145         #("YUV422P", "RGB")      : ("nppiYCbCr422ToRGB_8u_P3C3R",    YUV42xP_to_RGB_argtypes),
    146         #("YUV422P", "BGR")      : ("nppiYCbCr422ToBGR_8u_P3C3R",    YUV42xP_to_RGB_argtypes),
    147         #YUV420P:
    148         #("RGBX",    "YUV420P")  : ("nppiRGBToYCbCr420_8u_C3P3R",    RGB_to_YUV42xP_argtypes),
    149         #("BGRX",    "YUV420P")  : ("nppiBGRToYCbCr420_8u_C3P3R",    RGB_to_YUV42xP_argtypes),
    150         ("RGBX",    "YUV420P")  : ("nppiRGBToYCrCb420_8u_AC4P3R",   RGB_to_YUV42xP_argtypes),
    151         ("BGRX",    "YUV420P")  : ("nppiBGRToYCbCr420_8u_AC4P3R",   RGB_to_YUV42xP_argtypes),
    152         #("YUV420P", "RGB")      : ("nppiYCbCr420ToRGB_8u_P3C3R",    YUV42xP_to_RGB_argtypes),
    153         #("YUV420P", "BGR")      : ("nppiYCbCr420ToBGR_8u_P3C3R",    YUV42xP_to_RGB_argtypes),
    154         #("YUV420P", "RGBX")     : ("nppiYCrCb420ToRGB_8u_P3C4R",    YUV42xP_to_RGB_argtypes),
    155         #("YUV420P", "BGRX")     : ("nppiYCbCr420ToBGR_8u_P3C4R",    YUV42xP_to_RGB_argtypes),
    156         }
    157 #ie:
    158 #BGR to YUV420P:
    159 #NppStatus nppiBGRToYCbCr420_8u_C3P3R (const Npp8u *pSrc, int nSrcStep, Npp8u *pDst[3], int rDstStep[3], NppiSize oSizeROI)
    160 #pSrc Source-Image Pointer.
    161 #nSrcStep Source-Image Line Step.
    162 #pDst Destination-Planar-Image Pointer Array.
    163 #rDstStep Destination-Planar-Image Line Step Array.
    164 #oSizeROI Region-of-Interest (ROI). (struct with width and height)
    165 #Returns:
    166 #Image Data Related Error Codes, ROI Related Error Codes
    167 
    168 #For YUV444P:
    169 #NppStatus nppiRGBToYCbCr_8u_C3P3R(const Npp8u * pSrc, int nSrcStep, Npp8u * pDst[3], int nDstStep, NppiSize oSizeROI);
    170 #(only one nDstStep!
    171 
    172 #YUV420P to RGB:
    173 #NppStatus nppiYCbCrToRGB_8u_P3C3R(const Npp8u * const pSrc[3], int nSrcStep, Npp8u * pDst, int nDstStep, NppiSize oSizeROI);
    174 #YUV444P to RGB:
    175 #NppStatus nppiYCrCb420ToRGB_8u_P3C4R(const Npp8u * const pSrc[3],int rSrcStep[3], Npp8u * pDst, int nDstStep, NppiSize oSizeROI);
    176 #Those with alpha add:
    177 #Npp8u nAval
    178 
    179 
    180 NPP_NO_OPERATION_WARNING = 1
    181 NPP_DIVIDE_BY_ZERO_WARNING = 6
    182 NPP_AFFINE_QUAD_INCORRECT_WARNING = 28
    183 NPP_WRONG_INTERSECTION_ROI_WARNING = 29
    184 NPP_WRONG_INTERSECTION_QUAD_WARNING = 30
    185 NPP_DOUBLE_SIZE_WARNING = 35
    186 NPP_MISALIGNED_DST_ROI_WARNING = 10000
    187 
    188 WARNINGS = {
    189     NPP_NO_OPERATION_WARNING :  "Indicates that no operation was performed",
    190     NPP_DIVIDE_BY_ZERO_WARNING: "Divisor is zero however does not terminate the execution",
    191     NPP_AFFINE_QUAD_INCORRECT_WARNING:  "Indicates that the quadrangle passed to one of affine warping functions doesn't have necessary properties. First 3 vertices are used, the fourth vertex discarded",
    192     NPP_WRONG_INTERSECTION_ROI_WARNING: "The given ROI has no interestion with either the source or destination ROI. Thus no operation was performed",
    193     NPP_WRONG_INTERSECTION_QUAD_WARNING:"The given quadrangle has no intersection with either the source or destination ROI. Thus no operation was performed",
    194     NPP_DOUBLE_SIZE_WARNING:    "Image size isn't multiple of two. Indicates that in case of 422/411/420 sampling the ROI width/height was modified for proper processing",
    195     NPP_MISALIGNED_DST_ROI_WARNING: "Speed reduction due to uncoalesced memory accesses warning"
    196     }
    197 NPP_STEP_ERROR = -14
    198 NPP_NOT_EVEN_STEP_ERROR = -108
    199 
    200 ERRORS = {
    201     NPP_STEP_ERROR : "Step is less or equal zero",
    202     NPP_NOT_EVEN_STEP_ERROR :   "Step value is not pixel multiple",
    203           }
    204 
    205 
    206 YUV_INDEX_TO_PLANE = {
    207                       0 : "Y",
    208                       1 : "U",
    209                       2 : "V"
    210                       }
    211 
    212 
    21381def roundup(n, m):
    21482    return (n + m - 1) & ~(m - 1)
    21583
    21684
    217 COLORSPACES_MAP = {}
    218 for k, f_def in COLORSPACES_MAP_STR.items():
    219     fn, argtypes = f_def
    220     try:
    221         for lib in _NPP_LIBRARIES:
    222             if hasattr(lib, fn):
    223                 cfn = getattr(lib, fn)
    224                 debug("found %s for %s in %s: %s", fn, k, lib, cfn)
    225                 COLORSPACES_MAP[k] = (fn, cfn)
    226                 #set argument types and return type:
    227                 cfn.restype = ctypes.c_int
    228                 cfn.argtypes = argtypes
    229     except:
    230         log.error("could not load '%s', conversion disabled: %s", fn, k)
     85COLORSPACES_MAP = {
     86                   "BGRA" : ("YUV420P", "YUV422P", "YUV444P"),
     87                   "BGRX" : ("YUV420P", "YUV422P", "YUV444P"),
     88                   "RGBA" : ("YUV420P", "YUV422P", "YUV444P"),
     89                   "RGBX" : ("YUV420P", "YUV422P", "YUV444P"),
     90                   }
     91KERNELS_MAP = {}
     92for rgb_format, yuv_formats in COLORSPACES_MAP.items():
     93    m = gen_rgb_to_yuv_kernels(rgb_format, yuv_formats)
     94    KERNELS_MAP.update(m)
     95log.info("csc_nvcuda kernels: %s", KERNELS_MAP)
    23196
    232 
    23397def get_type():
    23498    return "nvcuda"
    23599
     
    237101    return pycuda.VERSION_TEXT
    238102
    239103def get_input_colorspaces():
    240     return sorted(set([src for src, _ in COLORSPACES_MAP.keys()]))
     104    return sorted(COLORSPACES_MAP.keys())
    241105
    242106def get_output_colorspaces(input_colorspace):
    243     return sorted(set(dst for src,dst in COLORSPACES_MAP.keys() if src==input_colorspace))
     107    return sorted(COLORSPACES_MAP.get(input_colorspace))
    244108
    245109def validate_in_out(in_colorspace, out_colorspace):
    246110    assert in_colorspace in get_input_colorspaces(), "invalid input colorspace: %s (must be one of %s)" % (in_colorspace, get_input_colorspaces())
     
    262126        self.dst_format = ""
    263127        self.time = 0
    264128        self.frames = 0
     129        self.cuda_device = None
     130        self.cuda_context = None
     131        self.max_block_sizes = 0
     132        self.max_grid_sizes = 0
     133        self.max_threads_per_block = 0
    265134        self.kernel_function = None
     135        self.kernel_function_name = None
    266136        self.context = None
    267137
    268138    def init_context(self, src_width, src_height, src_format,
     
    276146        self.dst_height = dst_height
    277147        self.dst_format = dst_format
    278148        self.context = context
    279         k = (src_format, dst_format)
    280         npp_fn = COLORSPACES_MAP.get(k)
    281         assert npp_fn is not None, "invalid pair: %s" % k
    282         self.kernel_function_name, cfn = npp_fn
    283         debug("init_context%s npp conversion function=%s (%s)", (src_width, src_height, src_format, dst_width, dst_height, dst_format), self.kernel_function_name, cfn)
    284         self.kernel_function = cfn
    285         if src_format.find("YUV")>=0:
    286             self.convert_image_fn = self.convert_image_yuv
    287         else:
    288             self.convert_image_fn = self.convert_image_rgb
     149        assert self.src_width==self.dst_width and self.src_height==self.dst_height, "scaling is not supported! (%sx%s to %sx%s)" % (self.src_width, self.src_height, self.dst_width, self.dst_height)
     150
     151        self.init_cuda(0)
     152
     153    def init_cuda(self, device_id):
     154        debug("init_cuda(%s)", device_id)
     155        self.cuda_device = driver.Device(DEFAULT_CUDA_DEVICE_ID)
     156        self.cuda_context = self.cuda_device.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
     157        #use alias to make code easier to read:
     158        d = self.cuda_device
     159        da = driver.device_attribute
     160        try:
     161            debug("init_cuda(%s) cuda_device=%s, cuda_context=%s", device_id, self.cuda_device, self.cuda_context)
     162            #compile/get kernel:
     163            key = self.src_format, self.dst_format
     164            k = KERNELS_MAP.get(key)
     165            assert k is not None, "kernel not found for %s" % str(key)
     166            self.kernel_function_name, ksrc = k
     167            debug("init_cuda(%s) found kernel %s: %s", self.kernel_function_name, ksrc)
     168            mod = SourceModule(ksrc)
     169            self.kernel_function = mod.get_function(self.kernel_function_name)
     170
     171            self.max_block_sizes = d.get_attribute(da.MAX_BLOCK_DIM_X), d.get_attribute(da.MAX_BLOCK_DIM_Y), d.get_attribute(da.MAX_BLOCK_DIM_Z)
     172            self.max_grid_sizes = d.get_attribute(da.MAX_GRID_DIM_X), d.get_attribute(da.MAX_GRID_DIM_Y), d.get_attribute(da.MAX_GRID_DIM_Z)
     173            debug("max_block_sizes=%s", self.max_block_sizes)
     174            debug("max_grid_sizes=%s", self.max_grid_sizes)
     175
     176            self.max_threads_per_block = self.BGRA2NV12.get_attribute(driver.function_attribute.MAX_THREADS_PER_BLOCK)
     177            debug("max_threads_per_block=%s", self.max_threads_per_block)
     178
     179            self.init_nvenc()
     180        finally:
     181            self.cuda_context.pop()
     182
     183        self.convert_image_fn = self.convert_image_rgb
    289184        debug("init_context(..) convert_image=%s", self.convert_image)
    290185
    291186    def get_info(self):
     
    348243        finally:
    349244            self.context.pop()
    350245
    351     def convert_image_yuv(self, image):
    352         global program
    353         start = time.time()
    354         iplanes = image.get_planes()
    355         width = image.get_width()
    356         height = image.get_height()
    357         strides = image.get_rowstride()
    358         pixels = image.get_pixels()
    359         debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels))
    360         assert iplanes==ImageWrapper._3_PLANES, "must use planar YUV as input"
    361         assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)
    362         assert len(strides)==len(pixels)==3, "invalid number of planes (%s) or strides (%s), should be 3" % (len(strides), len(pixels))
    363 
    364         #YUV444P argtypes = [(ctypes.c_void_p)*3, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, NppiSize]
    365         #YUV42xP argtypes = [(ctypes.c_void_p)*3, (ctypes.c_int)*3, ctypes.c_void_p, ctypes.c_int, NppiSize]
    366         in_t = self.kernel_function.argtypes[0]            #always: (ctypes.c_void_p)*3
    367         in_strides_t = self.kernel_function.argtypes[1]    #(ctypes.c_int)*3 OR ctypes.c_int
    368 
    369         divs = get_subsampling_divs(self.src_format)
    370 
    371         stream = driver.Stream()
    372         #copy each plane to the GPU:
    373         upload_start = time.time()
    374         locked_mem = []         #reference to pinned memory
    375         in_bufs = []            #GPU side yuv channels
    376         in_strides = []         #GPU side strides
    377         for i in range(3):
    378             x_div, y_div = divs[i]
    379             stride = strides[i]
    380             assert stride >= width/x_div, \
    381                 "invalid stride %s is smaller than plane %s width %s/%s" % (stride, YUV_INDEX_TO_PLANE.get(i, i), width, x_div)
    382             in_height = height/y_div
    383             plane = pixels[i]
    384             assert len(plane)>=stride*in_height
    385 
    386             mem = numpy.frombuffer(plane, dtype=numpy.byte)
    387             if True:
    388                 #keeping stride as it is:
    389                 in_buf = driver.mem_alloc(len(plane))
    390                 in_bufs.append(in_buf)
    391                 in_strides.append(stride)
    392                 hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP)
    393                 pycuda.driver.memcpy_htod_async(in_buf, mem, stream)
    394             else:
    395                 #change stride to what we get from mem_alloc_pitch:
    396                 in_buf, in_stride = driver.mem_alloc_pitch(stride, in_height, 4)
    397                 in_bufs.append(in_buf)
    398                 in_strides.append(in_stride)
    399                 hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP)
    400                 locked_mem.append(hmem)
    401                 copy = driver.Memcpy2D()
    402                 copy.set_src_host(hmem)
    403                 copy.set_dst_device(in_buf)
    404                 copy.src_pitch = stride
    405                 copy.dst_pitch = in_stride
    406                 copy.width_in_bytes = stride
    407                 copy.height = in_height
    408                 copy(stream)
    409         stream.synchronize()
    410         #all the copying is complete, we can unpin the host memory:
    411         for hmem in locked_mem:
    412             hmem.base.unregister()
    413         upload_end = time.time()
    414         debug("%s pixels now on GPU at %s, took %.1fms", sum([len(plane) for plane in pixels]), in_bufs, upload_end-upload_start)
    415 
    416         #allocate output RGB buffer on CPU:
    417         out_buf, out_stride = driver.mem_alloc_pitch(width*4, height, 4)
    418         src = in_t(*[Npp8u_p(in_buf) for in_buf in in_bufs])
    419         if in_strides_t==ctypes.c_int:
    420             #one stride for all planes (this must be YUV444P)
    421             assert len(set(in_strides))==1, "expected only one stride: %s" % str(in_strides)
    422             in_strides = [in_strides[0]]
    423         debug("in_strides=%s, out_stride=%s", in_strides, out_stride)
    424         kargs = [src, in_strides_t(*in_strides), Npp8u_p(out_buf), ctypes.c_int(out_stride), NppiSize(width, height)]
    425         if self.kernel_function.argtypes[-1]==CONSTANT_ALPHA:
    426             #add hardcoded constant alpha:
    427             kargs.append(ctypes.c_uint8(255))
    428         debug("calling %s%s", self.kernel_function_name, tuple(kargs))
    429         kstart = time.time()
    430         v = self.kernel_function(*kargs)
    431         #we can now free the GPU source planes:
    432         for in_buf in in_bufs:
    433             in_buf.free()
    434         if v<0:
    435             log.error("%s%s returned an error: %s", self.kernel_function_name, kargs, ERRORS.get(v, v))
    436             return None
    437         elif v>0 and v!=NPP_DOUBLE_SIZE_WARNING:
    438             #positive return-codes indicate warnings:
    439             warning = WARNINGS.get(v, "unknown")
    440             log.warn("%s returned a warning %s: %s", self.kernel_function_name, v, warning)
    441         kend = time.time()
    442         debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0)
    443 
    444         self.frames += 1
    445 
    446         read_start = time.time()
    447         gpu_size = out_stride*height
    448         min_size = width*4*height
    449         if gpu_size<=2*min_size:
    450             #direct full buffer async copy with GPU padding:
    451             pixels = driver.pagelocked_empty(stride*height, dtype=numpy.byte)
    452             driver.memcpy_dtoh_async(pixels, out_buf, stream)
    453         else:
    454             #we don't want the crazy large GPU padding, so we do it ourselves:
    455             stride = width*4
    456             pixels = driver.pagelocked_empty(stride*height, dtype=numpy.byte)
    457             copy = driver.Memcpy2D()
    458             copy.set_src_device(out_buf)
    459             copy.set_dst_host(pixels)
    460             copy.src_pitch = out_stride
    461             copy.dst_pitch = stride
    462             copy.width_in_bytes = width*4
    463             copy.height = height
    464             copy(stream)
    465         stream.synchronize()
    466 
    467         #the pixels have been copied, we can free the GPU output memory:
    468         out_buf.free()
    469         self.context.synchronize()
    470         read_end = time.time()
    471         debug("read back took %.1fms, total time: %.1f", (read_end-read_start)*1000.0, 1000.0*(time.time()-start))
    472         return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels.data, self.dst_format, 24, out_stride, planes=ImageWrapper.PACKED)
    473 
    474 
    475246    def convert_image_rgb(self, image):
    476247        global program
    477248        start = time.time()
    478249        iplanes = image.get_planes()
    479         width = image.get_width()
    480         height = image.get_height()
     250        w = image.get_width()
     251        h = image.get_height()
    481252        stride = image.get_rowstride()
    482253        pixels = image.get_pixels()
    483254        debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels))
    484255        assert iplanes==ImageWrapper.PACKED, "must use packed format as input"
    485256        assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)
    486 
    487257        divs = get_subsampling_divs(self.dst_format)
    488258
    489259        #copy packed rgb pixels to GPU:
    490260        upload_start = time.time()
    491261        stream = driver.Stream()
    492262        mem = numpy.frombuffer(pixels, dtype=numpy.byte)
    493         #keeping stride as it is:
    494         #the non async/pinned version is simple but slower:
    495         # gpu_image = driver.to_device(pixels)
    496         #followed by:
    497         # gpu_image.free()
    498263        in_buf = driver.mem_alloc(len(pixels))
    499264        hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP)
    500265        pycuda.driver.memcpy_htod_async(in_buf, mem, stream)
    501266
    502         #YUV444P argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, ctypes.c_int, NppiSize]
    503         #YUV42xP argtypes = [ctypes.c_void_p, ctypes.c_int, (ctypes.c_void_p)*3, (ctypes.c_int)*3, NppiSize]
    504         out_t = self.kernel_function.argtypes[2]            #always: (ctypes.c_void_p)*3
    505         out_strides_t = self.kernel_function.argtypes[3]    #(ctypes.c_int)*3 OR ctypes.c_int
    506267        out_bufs = []
    507268        out_strides = []
    508269        out_sizes = []
    509270        for i in range(3):
    510271            x_div, y_div = divs[i]
    511             out_stride = roundup(width/x_div, 4)
    512             out_height = roundup(height/y_div, 2)
     272            out_stride = roundup(self.dst_width/x_div, 4)
     273            out_height = roundup(self.dst_height/y_div, 2)
    513274            out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4)
    514275            out_bufs.append(out_buf)
    515276            out_strides.append(out_stride)
    516277            out_sizes.append((out_stride, out_height))
    517         dest = out_t(*[ctypes.cast(int(out_buf), ctypes.c_void_p) for out_buf in out_bufs])
    518         if out_strides_t==ctypes.c_int:
    519             #one stride for all planes (this must be YUV444P)
    520             assert len(set(out_strides))==1, "more than one stride where only one expected in: %s" % out_strides
    521             out_strides = [out_strides[0]]
    522         kargs = [Npp8u_p(in_buf), ctypes.c_int(stride), dest, out_strides_t(*out_strides), NppiSize(width, height)]
    523         #ensure copying has finished:
     278        #ensure uploading has finished:
    524279        stream.synchronize()
    525280        #we can now unpin the host memory:
    526281        hmem.base.unregister()
    527         debug("allocation took %.1fms", 1000.0*(time.time() - upload_start))
     282        debug("allocation and upload took %.1fms", 1000.0*(time.time() - upload_start))
    528283
    529         debug("calling %s%s", self.kernel_function_name, tuple(kargs))
    530284        kstart = time.time()
    531         v = self.kernel_function(*kargs)
     285        kargs = [in_buf, stride]
     286        for i in range(3):
     287            kargs.append(out_bufs[i])
     288            kargs.append(numpy.int32(out_strides[i]))
     289        blockw, blockh = 16, 16
     290        #divide each dimension by 2 since we process 4 pixels at a time:
     291        gridw = max(1, w/blockw/2)
     292        if gridw*2*blockw<w:
     293            gridw += 1
     294        gridh = max(1, h/blockh/2)
     295        if gridh*2*blockh<h:
     296            gridh += 1
     297        debug("calling %s%s, with grid=%s, block=%s", self.kernel_function_name, tuple(kargs), (gridw, gridh), (blockw, blockh))
     298        self.kernel_function(*kargs,
     299               block=(blockw,blockh,1), grid=(gridw, gridh))
     300
    532301        #we can now free the GPU source buffer:
    533302        in_buf.free()
    534         if v<0:
    535             log.error("%s%s returned an error: %s", self.kernel_function_name, kargs, ERRORS.get(v, v))
    536             return None
    537         elif v>0 and v!=NPP_DOUBLE_SIZE_WARNING:
    538             #positive return-codes indicate warnings:
    539             warning = WARNINGS.get(v, "unknown")
    540             log.warn("%s returned a warning %s: %s", self.kernel_function_name, v, warning)
    541303        kend = time.time()
    542304        debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0)
    543305        self.frames += 1