xpra icon
Bug tracker and wiki

Ticket #925: opencl-imageinterface.patch

File opencl-imageinterface.patch, 12.2 KB (added by Antoine Martin, 4 years ago)

use the image interface instead of buffers, more like the working example

  • xpra/codecs/csc_opencl/colorspace_converter.py

     
    322322    def add_rgb_to_yuv(src_rgb_mode, kernel_rgb_mode, upload_rgb_mode, channel_order):
    323323        log("add_rgb_to_yuv%s", (src_rgb_mode, kernel_rgb_mode, upload_rgb_mode, CHANNEL_ORDER_TO_STR.get(channel_order)))
    324324        kernels = gen_rgb_to_yuv_kernels(kernel_rgb_mode)
    325         #log("kernels(%s)=%s", rgb_mode, kernels)
     325        log("gen_rgb_to_yuv_kernels(%s)=%s", kernel_rgb_mode, kernels)
    326326        for key, k_def in kernels.items():
    327327            ksrc, dst = key
    328328            assert ksrc==kernel_rgb_mode
     
    554554        global context, program
    555555        self.context = context
    556556        self.program = program
    557         self.queue = pyopencl.CommandQueue(self.context)
    558557        fm = pyopencl.filter_mode.NEAREST
    559558        self.sampler = pyopencl.Sampler(self.context, False, pyopencl.addressing_mode.CLAMP_TO_EDGE, fm)
    560559        k_def = KERNELS_DEFS.get((self.src_format, self.dst_format))
     
    705704        wheight = dimdiv(self.dst_height, max(y_div for _, y_div in divs))
    706705        globalWorkSize, localWorkSize  = self.get_work_sizes(wwidth, wheight)
    707706
     707        self.queue = pyopencl.CommandQueue(self.context)
     708
    708709        kernelargs = [self.queue, globalWorkSize, localWorkSize]
    709710
    710711        iformat = pyopencl.ImageFormat(pyopencl.channel_order.R, pyopencl.channel_type.UNSIGNED_INT8)
     
    742743        out_array = numpy.empty(self.dst_width*self.dst_height*4, dtype=numpy.byte)
    743744        pyopencl.enqueue_read_image(self.queue, oimage, (0, 0), (self.dst_width, self.dst_height), out_array)
    744745        self.queue.finish()
     746        self.queue = None
    745747        log("readback using %s took %.1fms", CHANNEL_ORDER_TO_STR.get(self.channel_order), 1000.0*(time.time()-kend))
    746748        self.time += time.time()-start
    747749        self.frames += 1
     
    772774        iformat = pyopencl.ImageFormat(self.channel_order, pyopencl.channel_type.UNSIGNED_INT8)
    773775        shape = (stride//4, self.src_height)
    774776        log("convert_image() type=%s, input image format=%s, shape=%s, work size: local=%s, global=%s", type(pixels), iformat, shape, localWorkSize, globalWorkSize)
    775         idata = memoryview_to_bytes(pixels)
    776         if type(idata)==str:
    777             #str is not a buffer, so we have to copy the data
    778             #alternatively, we could copy it first ourselves using this:
    779             #pixels = numpy.fromstring(pixels, dtype=numpy.byte).data
    780             #but I think this would be even slower
    781             flags = mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR
    782         else:
    783             flags = mem_flags.READ_ONLY | mem_flags.USE_HOST_PTR
    784         iimage = pyopencl.Image(self.context, flags, iformat, shape=shape, hostbuf=idata)
     777        flags = mem_flags.READ_ONLY | mem_flags.USE_HOST_PTR
     778        iimage = pyopencl.Image(self.context, flags, iformat, shape, None, memoryview_to_bytes(pixels))
    785779
     780        self.queue = pyopencl.CommandQueue(self.context)
    786781        kernelargs = [self.queue, globalWorkSize, localWorkSize,
    787782                      iimage, numpy.int32(self.src_width), numpy.int32(self.src_height),
    788783                      numpy.int32(self.dst_width), numpy.int32(self.dst_height),
     
    792787        strides = []
    793788        out_buffers = []
    794789        out_sizes = []
     790        oformat = pyopencl.ImageFormat(pyopencl.channel_order.R, pyopencl.channel_type.UNSIGNED_INT8)
    795791        for i in range(3):
    796792            x_div, y_div = divs[i]
    797793            p_stride = roundup(self.dst_width // x_div, max(2, localWorkSize[0]))
     
    798794            p_height = roundup(self.dst_height // y_div, 2)
    799795            p_size = p_stride * p_height
    800796            #log("output buffer for channel %s: stride=%s, height=%s, size=%s", i, p_stride, p_height, p_size)
    801             out_buf = pyopencl.Buffer(self.context, mem_flags.WRITE_ONLY, p_size)
     797            out_buf = pyopencl.Image(self.context, pyopencl.mem_flags.WRITE_ONLY,
     798                            oformat, (p_stride, p_height))
    802799            out_buffers.append(out_buf)
    803800            kernelargs += [out_buf, numpy.int32(p_stride)]
    804801            strides.append(p_stride)
     
    806803
    807804        kstart = time.time()
    808805        log("convert_image(%s) calling %s%s after %.1fms", image, self.kernel_function_name, tuple(kernelargs), 1000.0*(kstart-start))
    809         self.kernel_function(*kernelargs)
    810         self.queue.finish()
     806        self.kernel_function(*kernelargs).wait()
     807        self.queue.flush()
    811808        #free input image:
    812809        iimage.release()
    813810        kend = time.time()
     
    814811        log("%s took %.1fms", self.kernel_function_name, 1000.0*(kend-kstart))
    815812
    816813        #read back:
    817         pixels = []
     814        narrays = []
    818815        for i in range(3):
    819             out_array = numpy.empty(out_sizes[i], dtype=numpy.byte)
    820             pixels.append(out_array.data)
    821             pyopencl.enqueue_read_buffer(self.queue, out_buffers[i], out_array, is_blocking=False)
     816            x_div, y_div = divs[i]
     817            p_stride = roundup(self.dst_width // x_div, max(2, localWorkSize[0]))
     818            p_height = roundup(self.dst_height // y_div, 2)
     819            out_array = numpy.zeros(out_sizes[i], dtype=numpy.uint8)
     820            pyopencl.enqueue_read_image(self.queue, out_buffers[i],
     821                        (0, 0, 0), (p_stride, p_height, 1), out_array).wait()
     822            narrays.append(out_array)
     823            #pyopencl.enqueue_read_buffer(self.queue, out_buffers[i], out_array, is_blocking=None).wait()
    822824        readstart = time.time()
    823825        log("queue read events took %.1fms (3 planes of size %s, with strides=%s)", 1000.0*(readstart-kend), out_sizes, strides)
    824826        self.queue.finish()
     827        self.queue = None
    825828        readend = time.time()
    826829        log("wait for read events took %.1fms", 1000.0*(readend-readstart))
     830        pixels = []
     831        for out_array in narrays:
     832            pixels.append(out_array.tobytes())
    827833        #free output buffers:
    828834        for out_buf in out_buffers:
    829835            out_buf.release()
  • xpra/codecs/csc_opencl/opencl_kernels.py

     
    358358__kernel void %s(read_only image2d_t src,
    359359              const uint srcw, const uint srch, const uint w, const uint h,
    360360              const sampler_t sampler,
    361               global uchar *dstY, const uint strideY,
    362               global uchar *dstU, const uint strideU,
    363               global uchar *dstV, const uint strideV) {
     361              __write_only image2d_t dstY, const uint strideY,
     362              __write_only image2d_t dstU, const uint strideU,
     363              __write_only image2d_t dstV, const uint strideV) {
    364364    const uint gx = get_global_id(0);
    365365    const uint gy = get_global_id(1);
    366366
    367367    if ((gx < w) & (gy < h)) {
    368         const uint4 p = read_imageui(src, sampler, (int2)( (gx*srcw)/w, (gy*srch)/h ));
    369 
    370         dstY[gx + gy*strideY] = convert_uchar_sat_rte(%s);
    371         dstU[gx + gy*strideU] = convert_uchar_sat_rte(%s);
    372         dstV[gx + gy*strideV] = convert_uchar_sat_rte(%s);
     368        const int2 src_coord = (int2) ((gx*srcw)/w, (gy*srch)/h);
     369        const uint4 p = read_imageui(src, sampler, src_coord);
     370        const uint4 yuv;
     371        const int2 dst_coord = (int2) (gx, gy);
     372        yuv.x = yuv.y = yuv.z = 128;    //convert_uchar_sat_rte(%s);
     373        write_imageui(dstY, dst_coord, yuv);
     374        yuv.x = yuv.y = yuv.z = 16;    //convert_uchar_sat_rte(%s);
     375        write_imageui(dstU, dst_coord, yuv);
     376        yuv.x = yuv.y = yuv.z = 200;    //convert_uchar_sat_rte(%s);
     377        write_imageui(dstV, dst_coord, yuv);
    373378    }
    374379}
    375380"""
     
    400405__kernel void %s(read_only image2d_t src,
    401406              uint srcw, uint srch, uint w, uint h,
    402407              const sampler_t sampler,
    403               global uchar *dstY, uint strideY,
    404               global uchar *dstU, uint strideU,
    405               global uchar *dstV, uint strideV) {
     408              __write_only image2d_t dstY, uint strideY,
     409              __write_only image2d_t dstU, uint strideU,
     410              __write_only image2d_t dstV, uint strideV) {
    406411    const uint gx = get_global_id(0);
    407412    const uint gy = get_global_id(1);
    408413
     
    415420
    416421        //write up to 2 Y pixels:
    417422        const uint i = gx*2 + gy*strideY;
    418         dstY[i] = convert_uchar_sat_rte(%s);
     423        //dstY[i] = convert_uchar_sat_rte(%s);
    419424        //we process two pixels at a time
    420425        //if the source width is odd, this destination pixel may not exist (right edge of picture)
    421426        //(we only read it via CLAMP_TO_EDGE to calculate U and V, which do exist)
     
    422427        if (gx*2+1 < w) {
    423428            srcx = (gx*2+1)*srcw/w;
    424429            p[1] = read_imageui(src, sampler, (int2)( srcx, srcy ));
    425             dstY[i+1] = convert_uchar_sat_rte(%s);
     430            //dstY[i+1] = convert_uchar_sat_rte(%s);
    426431        }
    427432
    428433        const int R = %s;
     
    429434        const int G = %s;
    430435        const int B = %s;
    431436        //write 1 U pixel:
    432         dstU[gx + gy*strideU] = convert_uchar_sat_rte(%s);
     437        //dstU[gx + gy*strideU] = convert_uchar_sat_rte(%s);
    433438        //write 1 V pixel:
    434         dstV[gx + gy*strideV] = convert_uchar_sat_rte(%s);
     439        //dstV[gx + gy*strideV] = convert_uchar_sat_rte(%s);
    435440    }
    436441}
    437442"""
     
    462467__kernel void %s(read_only image2d_t src,
    463468              const uint srcw, const uint srch, const uint w, const uint h,
    464469              const sampler_t sampler,
    465               global uchar *dstY, const uint strideY,
    466               global uchar *dstU, const uint strideU,
    467               global uchar *dstV, const uint strideV) {
     470              __write_only image2d_t dstY, const uint strideY,
     471              __write_only image2d_t dstU, const uint strideU,
     472              __write_only image2d_t dstV, const uint strideV) {
    468473    const uint gx = get_global_id(0);
    469474    const uint gy = get_global_id(1);
    470475
     
    473478        uint srcy = gy*2*srch/h;
    474479        uint4 p[4];
    475480        p[0] = read_imageui(src, sampler, (int2)( srcx, srcy ));
    476         p[1] = p[0];
    477         p[2] = p[0];
    478         p[3] = p[0];
     481        p[3] = p[2] = p[1] = p[0];
    479482
    480         uint i = gx*2 + gy*2*strideY;
    481         dstY[i] = convert_uchar_sat_rte(%s);
     483        const uint4 yuv;
     484        int2 dst_coord = (int2) (gx*2, gy*2);
     485        yuv.x = yuv.y = yuv.z = convert_uchar_sat_rte(%s);
     486        write_imageui(dstY, dst_coord, yuv);
    482487        if (gx*2+1 < w) {
     488            dst_coord.x += 1;
    483489            srcx = (gx*2+1)*srcw/w;
    484490            p[1] = read_imageui(src, sampler, (int2)( srcx, srcy ));
    485             dstY[i+1] = convert_uchar_sat_rte(%s);
     491            yuv.x = yuv.y = yuv.z = convert_uchar_sat_rte(%s);
     492            write_imageui(dstY, dst_coord, yuv);
    486493        }
    487494        if (gy*2+1 < h) {
    488             i += strideY;
     495            dst_coord.x -= 1;
     496            dst_coord.y += 1;
    489497            srcx = gx*2*srcw/w;
    490498            srcy = (gy*2+1)*srch/h;
    491499            p[2] = read_imageui(src, sampler, (int2)( srcx, srcy ));
    492             dstY[i] = convert_uchar_sat_rte(%s);
     500            yuv.x = yuv.y = yuv.z = convert_uchar_sat_rte(%s);
     501            write_imageui(dstY, dst_coord, yuv);
    493502            if (gx*2+1 < w) {
    494503                srcx = (gx*2+1)*srcw/w;
    495504                p[3] = read_imageui(src, sampler, (int2)( srcx, srcy ));
    496                 dstY[i+1] = convert_uchar_sat_rte(%s);
     505                yuv.x = yuv.y = yuv.z = convert_uchar_sat_rte(%s);
     506                write_imageui(dstY, dst_coord, yuv);
    497507            }
    498508        }
    499509
     
    500510        const int R = %s;
    501511        const int G = %s;
    502512        const int B = %s;
     513        dst_coord = (int2) (gx, gy);
    503514        //write 1 U pixel:
    504         dstU[gx + gy*strideU] = convert_uchar_sat_rte(%s);
     515        yuv.x = yuv.y = yuv.z = convert_uchar_sat_rte(%s);
     516        write_imageui(dstU, dst_coord, yuv);
    505517        //write 1 V pixel:
    506         dstV[gx + gy*strideV] = convert_uchar_sat_rte(%s);
     518        yuv.x = yuv.y = yuv.z = convert_uchar_sat_rte(%s);
     519        write_imageui(dstV, dst_coord, yuv);
    507520    }
    508521}
    509522"""