xpra icon
Bug tracker and wiki

This bug tracker and wiki are being discontinued
please use https://github.com/Xpra-org/xpra instead.


Ticket #466: nvenc-yuv444p.patch

File nvenc-yuv444p.patch, 35.7 KB (added by Antoine Martin, 7 years ago)

YUV444 for NVENC: using 3 pass encoding (one for each of Y, U and V)

  • xpra/client/ui_client_base.py

     
    686686            "encoding.video_subregion"  : True,
    687687            "encoding.video_reinit"     : True,
    688688            "encoding.video_scaling"    : True,
     689            "encoding.video_separateplane"  : True,
    689690            "encoding.rgb_lz4"          : use_lz4 and self.compression_level==1,
    690691            "encoding.transparency"     : self.has_transparency(),
    691692            #TODO: check for csc support (swscale only?)
  • xpra/codecs/dec_avcodec2/decoder.pyx

     
    218218    """
    219219
    220220    def __str__(self):                          #@DuplicatedSignature
    221         return ImageWrapper.__str__(self)+"-(%s)" % self.av_frame
     221        return ImageWrapper.__str__(self)+"-(%s)" % self.av_frames
    222222
    223223    def free(self):                             #@DuplicatedSignature
    224224        log("AVImageWrapper.free()")
     
    231231        self.xpra_free_frame()
    232232
    233233    def xpra_free_frame(self):
    234         log("AVImageWrapper.xpra_free_frame() av_frame=%s", self.av_frame)
    235         if self.av_frame:
    236             self.av_frame.xpra_free()
    237             self.av_frame = None
     234        log("AVImageWrapper.xpra_free_frame() av_frames=%s", self.av_frames)
     235        if self.av_frames:
     236            for av_frame in self.av_frames:
     237                av_frame.xpra_free()
     238            self.av_frames = None
    238239
    239240
    240 
    241241cdef class Decoder:
    242242    """
    243243        This wraps the AVCodecContext and its configuration,
     
    414414        cdef unsigned char * padded_buf = NULL
    415415        cdef const unsigned char * buf = NULL
    416416        cdef Py_ssize_t buf_len = 0
     417        cdef int offset
     418        cdef int size
    417419        cdef int len = 0
     420        cdef int step, steps
    418421        cdef int got_picture
    419422        cdef AVPacket avpkt
    420423        cdef unsigned long frame_key                #@DuplicatedSignature
    421424        cdef AVFrameWrapper framewrapper
    422425        cdef object img
     426        cdef object plane_offsets, plane_sizes
    423427        assert self.codec_ctx!=NULL
    424428        assert self.codec!=NULL
    425         #copy input buffer into padded C buffer:
     429
     430        #copy whole input buffer into padded C buffer:
    426431        PyObject_AsReadBuffer(input, <const void**> &buf, &buf_len)
    427432        padded_buf = <unsigned char *> xmemalign(buf_len+128)
    428433        memcpy(padded_buf, buf, buf_len)
    429434        memset(padded_buf+buf_len, 0, 128)
    430         #ensure we can detect if the frame buffer got allocated:
    431         clear_frame(self.frame)
    432         #now safe to run without gil:
    433         with nogil:
    434             av_init_packet(&avpkt)
    435             avpkt.data = <uint8_t *> padded_buf
    436             avpkt.size = buf_len
    437             len = avcodec_decode_video2(self.codec_ctx, self.frame, &got_picture, &avpkt)
    438             free(padded_buf)
    439         if len < 0: #for testing add: or options.get("frame", 0)%100==99:
    440             self.frame_error()
    441             log.warn("%s.decompress_image(%s:%s, %s) avcodec_decode_video2 failure: %s", self, type(input), buf_len, options, self.av_error_str(len))
    442             return None
    443             #raise Exception("avcodec_decode_video2 failed to decode this frame and returned %s, decoder=%s" % (len, self.get_info()))
    444435
    445         if self.actual_pix_fmt!=self.frame.format:
    446             self.actual_pix_fmt = self.frame.format
    447             if self.actual_pix_fmt not in ENUM_TO_FORMAT:
     436        #support for separate colour planes:
     437        #(each plane is sent in its own frame)
     438        plane_sizes = options.get("plane_sizes")
     439        log.info("plane_sizes=%s", plane_sizes)
     440        if plane_sizes is not None:
     441            #this is the separate colour plane format
     442            #each loop will only collect one plane:
     443            steps = 3
     444            nplanes = 1
     445            #the actual format is fixed:
     446            assert options.get("csc")=="YUV444P"
     447            self.actual_pix_fmt = FORMAT_TO_ENUM["YUV444P"]
     448            plane_offsets = [0, plane_sizes[0], plane_sizes[0]+plane_sizes[1]]
     449        else:
     450            #regular format (not using separate colour planes)
     451            #either RGB or YUV output
     452            #for YUV output, we need 3 planes:
     453            steps = 1
     454            nplanes = 3
     455            plane_offsets = [0]
     456            plane_sizes = [buf_len]
     457        #note: plain RGB output, will redefine those:
     458        out = []
     459        strides = []
     460        outsize = 0
     461        framewrappers = []
     462        log.info("steps=%s, plane_offsets=%s, plane_sizes=%s", steps, plane_offsets, plane_sizes)
     463        for step in range(steps):
     464            offset = plane_offsets[step]
     465            size = plane_sizes[step]
     466
     467            #ensure we can detect if the frame buffer got allocated:
     468            clear_frame(self.frame)
     469            #now safe to run without gil:
     470            with nogil:
     471                av_init_packet(&avpkt)
     472                avpkt.data = <uint8_t *> (padded_buf+offset)
     473                avpkt.size = size
     474                len = avcodec_decode_video2(self.codec_ctx, self.frame, &got_picture, &avpkt)
     475            if len<0:
    448476                self.frame_error()
    449                 raise Exception("unknown output pixel format: %s, expected %s (%s)" % (self.actual_pix_fmt, self.pix_fmt, self.colorspace))
    450             log("avcodec actual output pixel format is %s (%s), expected %s (%s)", self.actual_pix_fmt, self.get_actual_colorspace(), self.pix_fmt, self.colorspace)
     477                log.warn("%s.decompress_image(%s:%s, %s) avcodec_decode_video2 failure: %s", self, type(input), buf_len, options, self.av_error_str(len))
     478                return None
    451479
    452         #print("decompress image: colorspace=%s / %s" % (self.colorspace, self.get_colorspace()))
    453         cs = self.get_actual_colorspace()
    454         if cs.endswith("P"):
    455             out = []
    456             strides = []
    457             outsize = 0
    458             divs = get_subsampling_divs(cs)
    459             nplanes = 3
    460             for i in range(nplanes):
    461                 _, dy = divs[i]
    462                 if dy==1:
    463                     height = self.codec_ctx.height
    464                 elif dy==2:
    465                     height = (self.codec_ctx.height+1)>>1
    466                 else:
    467                     self.frame_error()
    468                     raise Exception("invalid height divisor %s" % dy)
    469                 stride = self.frame.linesize[i]
    470                 size = height * stride
    471                 outsize += size
     480            if steps==1:
     481                if self.actual_pix_fmt!=self.frame.format:
     482                    self.actual_pix_fmt = self.frame.format
     483                    if self.actual_pix_fmt not in ENUM_TO_FORMAT:
     484                        self.frame_error()
     485                        raise Exception("unknown output pixel format: %s, expected %s (%s)" % (self.actual_pix_fmt, self.pix_fmt, self.colorspace))
     486                    log("avcodec actual output pixel format is %s (%s), expected %s (%s)", self.actual_pix_fmt, self.get_actual_colorspace(), self.pix_fmt, self.colorspace)
     487
     488            cs = self.get_actual_colorspace()
     489            if cs.endswith("P"):
     490                divs = get_subsampling_divs(cs)
     491                for i in range(nplanes):
     492                    _, dy = divs[i]
     493                    if dy==1:
     494                        height = self.codec_ctx.height
     495                    elif dy==2:
     496                        height = (self.codec_ctx.height+1)>>1
     497                    else:
     498                        self.frame_error()
     499                        raise Exception("invalid height divisor %s" % dy)
     500                    stride = self.frame.linesize[i]
     501                    size = height * stride
     502                    outsize += size
     503                    if READ_ONLY:
     504                        plane = PyBuffer_FromMemory(<void *>self.frame.data[i], size)
     505                    else:
     506                        plane = PyBuffer_FromReadWriteMemory(<void *>self.frame.data[i], size)
     507                    out.append(plane)
     508                    strides.append(stride)
     509            else:
     510                #RGB mode: "out" is a single buffer
     511                strides = self.frame.linesize[0]+self.frame.linesize[1]+self.frame.linesize[2]
     512                outsize = self.codec_ctx.height * strides
    472513                if READ_ONLY:
    473                     plane = PyBuffer_FromMemory(<void *>self.frame.data[i], size)
     514                    out = PyBuffer_FromMemory(<void *>self.frame.data[0], outsize)
    474515                else:
    475                     plane = PyBuffer_FromReadWriteMemory(<void *>self.frame.data[i], size)
    476                 out.append(plane)
    477                 strides.append(stride)
    478         else:
    479             strides = self.frame.linesize[0]+self.frame.linesize[1]+self.frame.linesize[2]
    480             outsize = self.codec_ctx.height * strides
    481             if READ_ONLY:
    482                 out = PyBuffer_FromMemory(<void *>self.frame.data[0], outsize)
    483             else:
    484                 out = PyBuffer_FromReadWriteMemory(<void *>self.frame.data[0], outsize)
    485             nplanes = 0
    486         if outsize==0:
    487             self.frame_error()
    488             raise Exception("output size is zero!")
     516                    out = PyBuffer_FromReadWriteMemory(<void *>self.frame.data[0], outsize)
     517                nplanes = 0
     518
     519            #FIXME: we could lose track of framewrappers if an error occurs before the end:
     520            framewrapper = AVFrameWrapper()
     521            framewrapper.set_context(self.codec_ctx, self.frame)
     522            framewrappers.append(framewrapper)
     523
     524            if outsize==0:
     525                self.frame_error()
     526                raise Exception("output size is zero!")
     527
     528        free(padded_buf)
    489529        assert self.codec_ctx.width>=self.width, "codec width is smaller than our width: %s<%s" % (self.codec_ctx.width, self.width)
    490530        assert self.codec_ctx.height>=self.height, "codec height is smaller than our height: %s<%s" % (self.codec_ctx.height, self.height)
    491         img = AVImageWrapper(0, 0, self.width, self.height, out, cs, 24, strides, nplanes)
    492         img.av_frame = None
    493         framewrapper = AVFrameWrapper()
    494         framewrapper.set_context(self.codec_ctx, self.frame)
    495         img.av_frame = framewrapper
     531        img = AVImageWrapper(0, 0, self.width, self.height, out, cs, 24, strides, nplanes*steps)
     532        img.av_frames = framewrappers
    496533        self.frames += 1
    497534        #add to weakref list after cleaning it up:
    498535        self.weakref_images = [x for x in self.weakref_images if x() is not None]
    499         ref = weakref.ref(img)
    500         self.weakref_images.append(ref)
     536        self.weakref_images.append(weakref.ref(img))
    501537        log("%s.decompress_image(%s:%s, %s)=%s", self, type(input), buf_len, options, img)
     538        log("out=%s, strides=%s", out, strides)
    502539        return img
    503540
    504541
  • xpra/codecs/nvenc/CUDA_rgb2yuv444p.py

     
    33# Xpra is released under the terms of the GNU GPL v2, or, at your option, any
    44# later version. See the file COPYING for details.
    55
    6 
     6#no longer used, but good to keep as reference:
    77BGRA2YUV444P_kernel = """
    88#include <stdint.h>
    99
     
    3737        dstImage[di] = __float2int_rn(0.439 * R - 0.368 * G - 0.071 * B + 128);
    3838    }
    3939}
     40"""
     41
     42
     43BGRA2Y_kernel = """
     44#include <stdint.h>
     45
     46__global__ void BGRA2Y(uint8_t *srcImage, int src_w, int src_h, int srcPitch,
     47                       uint8_t *dstImage, int dst_w, int dst_h, int dstPitch,
     48                       int w, int h)
     49{
     50    uint32_t gx, gy;
     51    gx = blockIdx.x * blockDim.x + threadIdx.x;
     52    gy = blockIdx.y * blockDim.y + threadIdx.y;
     53
     54    uint32_t src_y = gy * src_h / dst_h;
     55    uint32_t src_x = gx * src_w / dst_w;
     56
     57    if ((src_x < w) & (src_y < h)) {
     58        uint8_t R;
     59        uint8_t G;
     60        uint8_t B;
     61        //one 32-bit RGB pixel at a time:
     62        uint32_t si = (src_y * srcPitch) + src_x * 4;
     63        R = srcImage[si+2];
     64        G = srcImage[si+1];
     65        B = srcImage[si];
     66
     67        dstImage[(gy * dstPitch) + gx] = __float2int_rn(0.257 * R + 0.504 * G + 0.098 * B + 16);
     68    }
     69}
     70"""
     71
     72
     73BGRA2U_kernel = """
     74#include <stdint.h>
     75
     76__global__ void BGRA2U(uint8_t *srcImage, int src_w, int src_h, int srcPitch,
     77                       uint8_t *dstImage, int dst_w, int dst_h, int dstPitch,
     78                       int w, int h)
     79{
     80    uint32_t gx, gy;
     81    gx = blockIdx.x * blockDim.x + threadIdx.x;
     82    gy = blockIdx.y * blockDim.y + threadIdx.y;
     83
     84    uint32_t src_y = gy * src_h / dst_h;
     85    uint32_t src_x = gx * src_w / dst_w;
     86
     87    if ((src_x < w) & (src_y < h)) {
     88        uint8_t R;
     89        uint8_t G;
     90        uint8_t B;
     91        //one 32-bit RGB pixel at a time:
     92        uint32_t si = (src_y * srcPitch) + src_x * 4;
     93        R = srcImage[si+2];
     94        G = srcImage[si+1];
     95        B = srcImage[si];
     96
     97        dstImage[(gy * dstPitch) + gx] = __float2int_rn(-0.148 * R - 0.291 * G + 0.439 * B + 128);
     98    }
     99}
     100"""
     101
     102
     103BGRA2V_kernel = """
     104#include <stdint.h>
     105
     106__global__ void BGRA2V(uint8_t *srcImage, int src_w, int src_h, int srcPitch,
     107                       uint8_t *dstImage, int dst_w, int dst_h, int dstPitch,
     108                       int w, int h)
     109{
     110    uint32_t gx, gy;
     111    gx = blockIdx.x * blockDim.x + threadIdx.x;
     112    gy = blockIdx.y * blockDim.y + threadIdx.y;
     113
     114    uint32_t src_y = gy * src_h / dst_h;
     115    uint32_t src_x = gx * src_w / dst_w;
     116
     117    if ((src_x < w) & (src_y < h)) {
     118        uint8_t R;
     119        uint8_t G;
     120        uint8_t B;
     121        //one 32-bit RGB pixel at a time:
     122        uint32_t si = (src_y * srcPitch) + src_x * 4;
     123        R = srcImage[si+2];
     124        G = srcImage[si+1];
     125        B = srcImage[si];
     126
     127        dstImage[(gy * dstPitch) + gx] = __float2int_rn(0.439 * R - 0.368 * G - 0.071 * B + 128);
     128    }
     129}
    40130"""
     131 No newline at end of file
  • xpra/codecs/nvenc/encoder.pyx

     
    1818                record_device_failure, record_device_success
    1919from xpra.codecs.codec_constants import codec_spec, TransientCodecException
    2020from xpra.codecs.image_wrapper import ImageWrapper
     21from xpra.codecs.nvenc.CUDA_rgb2yuv444p import BGRA2Y_kernel, BGRA2U_kernel, BGRA2V_kernel
     22from xpra.codecs.nvenc.CUDA_rgb2nv12 import BGRA2NV12_kernel
     23
    2124from xpra.log import Logger
    2225log = Logger("encoder", "nvenc")
    2326
     
    3336    MIN_COMPUTE = 0
    3437
    3538
    36 #API is undocumented and broken:
    37 USE_YUV444P = False
    38 
    39 
    4039cdef extern from "Python.h":
    4140    ctypedef int Py_ssize_t
    4241    int PyObject_AsWriteBuffer(object obj,
     
    11201119cpdef get_CUDA_CSC_function(device_id, function_name, kernel_source):
    11211120    return function_name, get_CUDA_function(device_id, function_name, kernel_source)
    11221121
    1123 cpdef get_BGRA2YUV444P(device_id):
    1124     from xpra.codecs.nvenc.CUDA_rgb2yuv444p import BGRA2YUV444P_kernel
    1125     return get_CUDA_CSC_function(device_id, "BGRA2YUV444P", BGRA2YUV444P_kernel)
    11261122
     1123cpdef get_BGRA2Y(device_id):
     1124    return get_CUDA_CSC_function(device_id, "BGRA2Y", BGRA2Y_kernel)
     1125
     1126cpdef get_BGRA2U(device_id):
     1127    return get_CUDA_CSC_function(device_id, "BGRA2U", BGRA2U_kernel)
     1128
     1129cpdef get_BGRA2V(device_id):
     1130    return get_CUDA_CSC_function(device_id, "BGRA2V", BGRA2V_kernel)
     1131
     1132
    11271133cpdef get_BGRA2NV12(device_id):
    1128     from xpra.codecs.nvenc.CUDA_rgb2nv12 import BGRA2NV12_kernel
    11291134    return get_CUDA_CSC_function(device_id, "BGRA2NV12", BGRA2NV12_kernel)
    11301135
    11311136
     
    11361141    cdef int input_height
    11371142    cdef int encoder_width
    11381143    cdef int encoder_height
     1144    cdef int separate_plane
    11391145    cdef object src_format
    11401146    cdef object scaling
    11411147    cdef int speed
     
    11461152    cdef object cuda_device_info
    11471153    cdef object cuda_device
    11481154    cdef object cuda_context
    1149     cdef object kernel
    1150     cdef object kernel_name
     1155    cdef object kernels
     1156    cdef object kernel_names
    11511157    cdef object max_block_sizes
    11521158    cdef object max_grid_sizes
    11531159    cdef int max_threads_per_block
     
    11711177    #statistics, etc:
    11721178    cdef double time
    11731179    cdef int frames
     1180    cdef int index
    11741181    cdef object last_frame_times
    11751182    cdef long long bytes_in
    11761183    cdef long long bytes_out
     
    12101217
    12111218    def init_context(self, int width, int height, src_format, encoding, int quality, int speed, scaling, options={}):    #@DuplicatedSignature
    12121219        assert encoding in get_encodings(), "invalid encoding %s" % encoding
    1213         log("init_context%s", (width, height, src_format, encoding, quality, speed, scaling, options))
     1220        log.info("init_context%s", (width, height, src_format, encoding, quality, speed, scaling, options))
    12141221        self.width = width
    12151222        self.height = height
    12161223        self.speed = speed
     
    12251232        self.codec_name = "H264"
    12261233        self.preset_name = None
    12271234        self.frames = 0
     1235        self.index = 0
    12281236        self.cuda_device = None
    12291237        self.cuda_context = None
     1238        self.separate_plane = options.get("video_separateplane", False)
    12301239        self.pixel_format = ""
    12311240        self.last_frame_times = maxdeque(200)
    12321241        start = time.time()
     
    12361245            self.init_cuda(options.get("cuda_device", -1))
    12371246            record_device_success(self.cuda_device_id)
    12381247        except Exception, e:
     1248            log("init_cuda failed", exc_info=True)
    12391249            record_device_failure(self.cuda_device_id)
    12401250            raise e
    12411251
     
    12431253        log("init_context%s took %1.fms", (width, height, src_format, quality, speed, options), (end-start)*1000.0)
    12441254
    12451255    cdef init_cuda(self, preferred_device_id=-1):
     1256        cdef int plane_size_div
     1257        cdef int max_input_stride
     1258
    12461259        self.cuda_device_id, self.cuda_device = select_device(preferred_device_id, min_compute=MIN_COMPUTE)
    12471260        assert self.cuda_device, "no NVENC device found!"
    12481261        global context_counter, last_context_failure
     
    12651278        #use alias to make code easier to read:
    12661279        da = driver.device_attribute
    12671280        try:
    1268             if USE_YUV444P:
    1269                 #FIXME: YUV444P doesn't work and I don't know why
    1270                 #No idea what "separateColourPlaneFlag" is meant to do either
    1271                 self.kernel_name, self.kernel = get_BGRA2YUV444P(self.cuda_device_id)
     1281            #if supported (separate plane flag), use YUV444P:
     1282            if self.separate_plane:
     1283                kernel_gen = (get_BGRA2Y, get_BGRA2U, get_BGRA2V)
    12721284                self.bufferFmt = NV_ENC_BUFFER_FORMAT_YUV444_PL
    12731285                self.pixel_format = "YUV444P"
    12741286                #3 full planes:
    12751287                plane_size_div = 1
    12761288            else:
    1277                 self.kernel_name, self.kernel = get_BGRA2NV12(self.cuda_device_id)
     1289                kernel_gen = (get_BGRA2NV12,)
    12781290                self.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL
    12791291                self.pixel_format = "NV12"
    12801292                #1 full Y plane and 2 U+V planes subsampled by 4:
    12811293                plane_size_div = 2
    12821294
     1295            #generate and compile the kernels:
     1296            self.kernel_names = []
     1297            self.kernels = []
     1298            for genk in kernel_gen:
     1299                kernel_name, kernel = genk(self.cuda_device_id)
     1300                self.kernel_names.append(kernel_name)
     1301                self.kernels.append(kernel)
     1302
    12831303            #allocate CUDA input buffer (on device) 32-bit RGB
    12841304            #(and make it bigger just in case - subregions from XShm can have a huge rowstride):
    12851305            max_input_stride = max(2560, self.input_width)*4
     
    12971317            log("max_block_sizes=%s", self.max_block_sizes)
    12981318            log("max_grid_sizes=%s", self.max_grid_sizes)
    12991319
    1300             self.max_threads_per_block = self.kernel.get_attribute(driver.function_attribute.MAX_THREADS_PER_BLOCK)
     1320            #should be the same for all kernels... but cheap to be pedantic:
     1321            self.max_threads_per_block = min([kernel.get_attribute(driver.function_attribute.MAX_THREADS_PER_BLOCK) for kernel in self.kernels])
    13011322            log("max_threads_per_block=%s", self.max_threads_per_block)
    13021323
    13031324            self.init_nvenc()
     
    13421363            if presetConfig!=NULL:
    13431364                presetConfig.presetCfg.encodeCodecConfig.h264Config.enableVFR = 1
    13441365                presetConfig.presetCfg.encodeCodecConfig.h264Config.idrPeriod = NVENC_INFINITE_GOPLENGTH
    1345                 #needed for YUV444P?
    1346                 #presetConfig.presetCfg.encodeCodecConfig.h264Config.separateColourPlaneFlag = 1
     1366                if self.pixel_format=="YUV444P":
     1367                    presetConfig.presetCfg.encodeCodecConfig.h264Config.separateColourPlaneFlag = 1
    13471368                params.encodeConfig = &presetConfig.presetCfg
    13481369            else:
    13491370                self.preset_name = None
     
    14951516        client_options = {"frame" : self.frames}
    14961517        if self.scaling!=(1,1):
    14971518            client_options["scaled_size"] = self.encoder_width, self.encoder_height
     1519            #self.pixel_format
    14981520        return client_options
    14991521
    15001522    def set_encoding_speed(self, speed):
     
    15681590
    15691591        #copy input buffer to CUDA buffer:
    15701592        driver.memcpy_htod(self.cudaInputBuffer, self.inputBuffer)
     1593        self.bytes_in += input_size
    15711594        log("compress_image(..) input buffer copied to device")
    15721595
    15731596        #FIXME: find better values and validate against max_block/max_grid:
    1574         blockw, blockh = 16, 16
    15751597        if self.pixel_format=="NV12":
     1598            #just one pass with no offset:
     1599            offsets = [0]
    15761600            #(these values are derived from the kernel code - which we should know nothing about here..)
    15771601            #divide each dimension by 2 since we process 4 pixels at a time:
    15781602            dx, dy = 2, 2
    15791603        else:
    1580             #YUV444P does one pixel at a time:
     1604            assert self.pixel_format=="YUV444P"
     1605            #3 passes, one for each of Y, U and V
     1606            offsets = [0, self.encoder_height*self.outputPitch, 2*self.encoder_height*self.outputPitch]
     1607            #one pixel at a time:
    15811608            dx, dy = 1, 1
     1609
     1610        #calculate grids/blocks:
     1611        #a block is a group of threads: (blockw * blockh) threads
     1612        #a grid is a group of blocks: (gridw * gridh) blocks
     1613        blockw, blockh = 16, 16
    15821614        gridw = max(1, w/blockw/dx)
    15831615        if gridw*2*blockw<w:
    15841616            gridw += 1
     
    15861618        #if dy made us round down, add one:
    15871619        if gridh*dy*blockh<h:
    15881620            gridh += 1
    1589         log("compress_image(..) calling CUDA CSC kernel %s", self.kernel_name)
    15901621        in_w, in_h = self.input_width, self.input_height
    15911622        if self.scaling!=(1,1):
    15921623            #scaling so scale exact dimensions, not padded input dimensions:
    15931624            in_w, in_h = w, h
    1594         self.kernel(self.cudaInputBuffer, numpy.int32(in_w), numpy.int32(in_h), numpy.int32(stride),
    1595                     self.cudaOutputBuffer, numpy.int32(self.encoder_width), numpy.int32(self.encoder_height), numpy.int32(self.outputPitch),
    1596                     numpy.int32(w), numpy.int32(h),
    1597                     block=(blockw,blockh,1), grid=(gridw, gridh))
    1598         #a block is a group of threads: (blockw * blockh) threads
    1599         #a grid is a group of blocks: (gridw * gridh) blocks
    1600         csc_end = time.time()
    1601         log("compress_image(..) kernel executed - CSC took %.1f ms", (csc_end - start)*1000.0)
    16021625
    1603         #map buffer so nvenc can access it:
    1604         memset(&mapInputResource, 0, sizeof(NV_ENC_MAP_INPUT_RESOURCE))
    1605         mapInputResource.version = NV_ENC_MAP_INPUT_RESOURCE_VER
    1606         mapInputResource.registeredResource  = self.inputHandle
    1607         raiseNVENC(self.functionList.nvEncMapInputResource(self.context, &mapInputResource), "mapping input resource")
    1608         log("compress_image(..) device buffer mapped to %#x", <unsigned long> mapInputResource.mappedResource)
     1626        #for storing result:
     1627        data = []
     1628        sizes = []
    16091629
    1610         size = 0
    1611         try:
    1612             memset(&picParams, 0, sizeof(NV_ENC_PIC_PARAMS))
    1613             picParams.version = NV_ENC_PIC_PARAMS_VER
    1614             picParams.bufferFmt = self.bufferFmt
    1615             picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME
    1616             picParams.inputWidth = self.encoder_width
    1617             picParams.inputHeight = self.encoder_height
    1618             picParams.inputPitch = self.outputPitch
    1619             picParams.inputBuffer = mapInputResource.mappedResource
    1620             picParams.outputBitstream = self.bitstreamBuffer
    1621             #picParams.pictureType: required when enablePTD is disabled
    1622             if self.frames==0:
    1623                 #only the first frame needs to be IDR (as we never lose frames)
    1624                 picParams.pictureType = NV_ENC_PIC_TYPE_IDR
    1625             else:
    1626                 picParams.pictureType = NV_ENC_PIC_TYPE_P
    1627             picParams.codecPicParams.h264PicParams.displayPOCSyntax = 2*self.frames
    1628             picParams.codecPicParams.h264PicParams.refPicFlag = self.frames==0
    1629             picParams.codecPicParams.h264PicParams.sliceMode = 3            #sliceModeData specifies the number of slices
    1630             picParams.codecPicParams.h264PicParams.sliceModeData = 1        #1 slice!
    1631             #picParams.encodePicFlags = NV_ENC_PIC_FLAG_OUTPUT_SPSPPS
    1632             picParams.frameIdx = self.frames
    1633             #picParams.inputTimeStamp = int(1000.0 * time.time())
    1634             #inputDuration = 0      #FIXME: use frame delay?
    1635             picParams.rcParams.rateControlMode = NV_ENC_PARAMS_RC_VBR     #FIXME: check NV_ENC_CAPS_SUPPORTED_RATECONTROL_MODES caps
    1636             picParams.rcParams.averageBitRate = 5000000   #5Mbits/s
    1637             picParams.rcParams.maxBitRate = 10000000      #10Mbits/s
     1630        step = 0
     1631        for step in range(len(offsets)):
     1632            offset = offsets[step]
     1633            kernel = self.kernels[step]
     1634            kernel_name = self.kernel_names[step]
     1635            csc_start = time.time()
     1636            log("compress_image(..) calling CUDA CSC kernel %s", kernel_name)
     1637            kernel(self.cudaInputBuffer, numpy.int32(in_w), numpy.int32(in_h), numpy.int32(stride),
     1638                   self.cudaOutputBuffer, numpy.int32(self.encoder_width), numpy.int32(self.encoder_height), numpy.int32(self.outputPitch),
     1639                   numpy.int32(w), numpy.int32(h),
     1640                   block=(blockw,blockh,1), grid=(gridw, gridh))
     1641            csc_end = time.time()
     1642            log("compress_image(..) kernel %s executed - CSC took %.1f ms", kernel_name, (csc_end - csc_start)*1000.0)
    16381643
    1639             raiseNVENC(self.functionList.nvEncEncodePicture(self.context, &picParams), "error during picture encoding")
    1640             encode_end = time.time()
    1641             log("compress_image(..) encoded in %.1f ms", (encode_end-csc_end)*1000.0)
     1644            #map buffer so nvenc can access it:
     1645            memset(&mapInputResource, 0, sizeof(NV_ENC_MAP_INPUT_RESOURCE))
     1646            mapInputResource.version = NV_ENC_MAP_INPUT_RESOURCE_VER
     1647            mapInputResource.registeredResource  = self.inputHandle
     1648            raiseNVENC(self.functionList.nvEncMapInputResource(self.context, &mapInputResource), "mapping input resource")
     1649            log("compress_image(..) device buffer mapped to %#x", <unsigned long> mapInputResource.mappedResource)
     1650   
     1651            size = 0
     1652            try:
     1653                memset(&picParams, 0, sizeof(NV_ENC_PIC_PARAMS))
     1654                picParams.version = NV_ENC_PIC_PARAMS_VER
     1655                picParams.bufferFmt = self.bufferFmt
     1656                picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME
     1657                picParams.inputWidth = self.encoder_width
     1658                picParams.inputHeight = self.encoder_height
     1659                picParams.inputPitch = self.outputPitch
     1660                picParams.inputBuffer = mapInputResource.mappedResource
     1661                picParams.outputBitstream = self.bitstreamBuffer
     1662                #picParams.pictureType: required when enablePTD is disabled
     1663                if self.frames==0:
     1664                    #only the first frame needs to be IDR (as we never lose frames)
     1665                    picParams.pictureType = NV_ENC_PIC_TYPE_IDR
     1666                else:
     1667                    picParams.pictureType = NV_ENC_PIC_TYPE_P
     1668                picParams.codecPicParams.h264PicParams.displayPOCSyntax = 2*self.frames
     1669                picParams.codecPicParams.h264PicParams.refPicFlag = self.frames==0
     1670                picParams.codecPicParams.h264PicParams.sliceMode = 3            #sliceModeData specifies the number of slices
     1671                picParams.codecPicParams.h264PicParams.sliceModeData = 1        #1 slice!
     1672                #picParams.encodePicFlags = NV_ENC_PIC_FLAG_OUTPUT_SPSPPS
     1673                picParams.frameIdx = self.index
     1674                self.index += 1
     1675                #picParams.inputTimeStamp = int(1000.0 * time.time())
     1676                #inputDuration = 0      #FIXME: use frame delay?
     1677                picParams.rcParams.rateControlMode = NV_ENC_PARAMS_RC_VBR     #FIXME: check NV_ENC_CAPS_SUPPORTED_RATECONTROL_MODES caps
     1678                picParams.rcParams.averageBitRate = 5000000   #5Mbits/s
     1679                picParams.rcParams.maxBitRate = 10000000      #10Mbits/s
     1680   
     1681                raiseNVENC(self.functionList.nvEncEncodePicture(self.context, &picParams), "error during picture encoding")
     1682                encode_end = time.time()
     1683                log("compress_image(..) encoded in %.1f ms", (encode_end-csc_end)*1000.0)
     1684   
     1685                #lock output buffer:
     1686                memset(&lockOutputBuffer, 0, sizeof(NV_ENC_LOCK_BITSTREAM))
     1687                lockOutputBuffer.version = NV_ENC_LOCK_BITSTREAM_VER
     1688                lockOutputBuffer.doNotWait = 0
     1689                lockOutputBuffer.outputBitstream = self.bitstreamBuffer
     1690                raiseNVENC(self.functionList.nvEncLockBitstream(self.context, &lockOutputBuffer), "locking output buffer")
     1691                log("compress_image(..) output buffer locked, bitstreamBufferPtr=%#x", <unsigned long> lockOutputBuffer.bitstreamBufferPtr)
     1692   
     1693                #copy to python buffer:
     1694                size = lockOutputBuffer.bitstreamSizeInBytes
     1695                sizes.append(size)
     1696                self.bytes_out += size
     1697                data.append((<char *> lockOutputBuffer.bitstreamBufferPtr)[:size])
     1698            finally:
     1699                raiseNVENC(self.functionList.nvEncUnlockBitstream(self.context, self.bitstreamBuffer), "unlocking output buffer")
     1700                raiseNVENC(self.functionList.nvEncUnmapInputResource(self.context, mapInputResource.mappedResource), "unmapping input resource")
    16421701
    1643             #lock output buffer:
    1644             memset(&lockOutputBuffer, 0, sizeof(NV_ENC_LOCK_BITSTREAM))
    1645             lockOutputBuffer.version = NV_ENC_LOCK_BITSTREAM_VER
    1646             lockOutputBuffer.doNotWait = 0
    1647             lockOutputBuffer.outputBitstream = self.bitstreamBuffer
    1648             raiseNVENC(self.functionList.nvEncLockBitstream(self.context, &lockOutputBuffer), "locking output buffer")
    1649             log("compress_image(..) output buffer locked, bitstreamBufferPtr=%#x", <unsigned long> lockOutputBuffer.bitstreamBufferPtr)
    1650 
    1651             #copy to python buffer:
    1652             size = lockOutputBuffer.bitstreamSizeInBytes
    1653             data = (<char *> lockOutputBuffer.bitstreamBufferPtr)[:size]
    1654         finally:
    1655             raiseNVENC(self.functionList.nvEncUnlockBitstream(self.context, self.bitstreamBuffer), "unlocking output buffer")
    1656             raiseNVENC(self.functionList.nvEncUnmapInputResource(self.context, mapInputResource.mappedResource), "unmapping input resource")
    1657 
    16581702        end = time.time()
    16591703        log("compress_image(..) download took %.1f ms", (end-encode_end)*1000.0)
    16601704        #update info:
     
    16621706
    16631707        self.last_frame_times.append((start, end))
    16641708        self.time += end-start
    1665         log("compress_image(..) returning %s bytes (%.1f%%), complete compression for frame %s took %.1fms", size, 100.0*size/input_size, self.frames, 1000.0*(end-start))
     1709        outdata = "".join(data)
     1710        outsize = len(outdata)
     1711        log("compress_image(..) returning %s bytes (%.1f%%), complete compression for frame %s took %.1fms", outsize, 100.0*outsize/input_size, self.frames, 1000.0*(end-start))
    16661712        #log("pixels head: %s", binascii.hexlify(data[:128]))
    16671713        client_options = self.get_client_options(options)
    1668         self.bytes_in += input_size
    1669         self.bytes_out += size
     1714        if self.pixel_format=="YUV444P":
     1715            assert len(offsets)==3
     1716            #tell the client that the data contains 3 chunks
     1717            #and how to find them in the joined buffer:
     1718            client_options["plane_sizes"] = sizes
    16701719        self.frames += 1
    1671         return data, client_options
     1720        return outdata, client_options
    16721721
    16731722
    16741723    cdef NV_ENC_PRESET_CONFIG *get_preset_config(self, name, GUID encode_GUID, GUID preset_GUID):
  • xpra/server/proxy_instance_process.py

     
    286286            caps = packet[1]
    287287            if caps.get("info_request", False):
    288288                proto.send_now(("hello", self.get_proxy_info(proto)))
    289                 self.timeout_add(5*1000, self.send_disconnect, "info sent")
     289                self.timeout_add(5*1000, self.send_disconnect, proto, "info sent")
    290290                return
    291291            elif caps.get("stop_request", False):
    292292                self.stop("socket request", None)
  • xpra/server/window_video_source.py

     
    665665            for out_csc, csc_spec in csc_specs:
    666666                actual_csc = self.csc_equiv(out_csc)
    667667                if actual_csc in self.csc_modes and (not bool(FORCE_CSC_MODE) or FORCE_CSC_MODE==out_csc):
    668                     add_scores("via %s" % out_csc, csc_spec, out_csc)
     668                    add_scores("via %s (%s)" % (out_csc, actual_csc), csc_spec, out_csc)
    669669        s = sorted(scores, key=lambda x : -x[0])
    670670        scorelog("get_video_pipeline_options%s scores=%s", (encoding, width, height, src_format), s)
    671671        return s
     
    674674        #in some places, we want to check against the subsampling used
    675675        #and not the colorspace itself.
    676676        #and NV12 uses the same subsampling as YUV420P...
    677         return {"NV12" : "YUV420P"}.get(csc_mode, csc_mode)
     677        return {"NV12" : "YUV420P",
     678                "BGRX" : "YUV444P"}.get(csc_mode, csc_mode)
    678679
    679680
    680681    def get_quality_score(self, csc_format, csc_spec, encoder_spec):
     
    10521053                #(note: see csc_equiv!)
    10531054                if self.uses_csc_atoms:
    10541055                    client_options["csc"] = self.csc_equiv(csc)
     1056                    log.info("csc=%s", self.csc_equiv(csc))
    10551057                else:
    10561058                    #ugly hack: expose internal ffmpeg/libav constant
    10571059                    #for old versions without the "csc_atoms" feature:
     
    10601062                #(unless the video encoder has already done so):
    10611063                if self._csc_encoder and ("scaled_size" not in client_options) and (enc_width!=width or enc_height!=height):
    10621064                    client_options["scaled_size"] = enc_width, enc_height
    1063             log("video_encode encoder: %s %sx%s result is %s bytes (%.1f MPixels/s), client options=%s",
     1065            log.info("video_encode encoder: %s %sx%s result is %s bytes (%.1f MPixels/s), client options=%s",
    10641066                                encoding, enc_width, enc_height, len(data), (enc_width*enc_height/(end-start+0.000001)/1024.0/1024.0), client_options)
    10651067            return self._video_encoder.get_type(), Compressed(encoding, data), client_options, width, height, 0, 24
    10661068        finally: