Commit e84136ba authored by Sebastian Dröge

videomixer: Port most blending related functions to orc

Only remaining MMX implementation is the ARGB/BGRA/AYUV blending
for which we first need the orc compositing opcodes.
parent e87802ef
plugin_LTLIBRARIES = libgstvideomixer.la
ORC_SOURCE=blendorc
include $(top_srcdir)/common/orc.mak
libgstvideomixer_la_SOURCES = videomixer.c blend.c
nodist_libgstvideomixer_la_SOURCES = $(ORC_NODIST_SOURCES)
libgstvideomixer_la_CFLAGS = $(GST_PLUGINS_BASE_CFLAGS) \
$(GST_BASE_CFLAGS) $(GST_CONTROLLER_CFLAGS) $(GST_CFLAGS) $(ORC_CFLAGS)
libgstvideomixer_la_LIBADD = $(GST_PLUGINS_BASE_LIBS) -lgstvideo-@GST_MAJORMINOR@ \
......
This diff is collapsed.
......@@ -122,229 +122,3 @@ NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
}
#endif
#ifdef GENERIC
/*
 * _memcpy_u8_mmx:
 * @dest: destination buffer
 * @src: source buffer
 * @count: number of bytes to copy
 *
 * Copies @count bytes from @src to @dest with x86 inline assembly.
 *
 * The first (@count % 8) bytes are copied one at a time through %al; the
 * remaining bytes are then copied eight at a time through the MMX register
 * mm0. Neither pointer is aligned before the 8-byte loop starts. `emms` is
 * executed on every path before the asm statement ends, and the x87 stack
 * registers are listed as clobbered.
 *
 * NOTE(review): @count is unsigned but is divided by 8 with `sar`
 * (arithmetic shift), so a count with the top bit set would presumably be
 * mishandled -- confirm whether callers can ever pass such a value.
 */
static inline void
_memcpy_u8_mmx (guint8 * dest, const guint8 * src, guint count)
{
/* *INDENT-OFF* */
__asm__ __volatile__ (
"1: \n\t"
/* Skip the byte loop when count is already a multiple of 8. */
"test $7, %0 \n\t"
"je 3f \n\t"
/* Byte loop: copy single bytes until the remaining count is a multiple of 8. */
"2: \n\t"
"movb (%2), %%al \n\t"
"movb %%al, (%1) \n\t"
"inc %2 \n\t"
"inc %1 \n\t"
"dec %0 \n\t"
"test $7, %0 \n\t"
"jne 2b \n\t"
/* Convert the remaining byte count into a count of 8-byte chunks. */
"3: \n\t"
"sar $3, %0 \n\t"
"cmp $0, %0 \n\t"
"je 5f \n\t"
/* Quadword loop: copy 8 bytes per iteration through mm0. */
"4: \n\t"
"movq (%2), %%mm0 \n\t"
"movq %%mm0, (%1) \n\t"
"add $8, %2 \n\t"
"add $8, %1 \n\t"
"dec %0 \n\t"
"jne 4b \n\t"
"5: \n\t"
"emms \n\t"
/* Operands 0/1/2 (count/dest/src) are read-write via matching constraints. */
: "=r" (count), "=q" (dest), "=q" (src)
: "0" (count), "1" (dest), "2" (src)
: "memory", "al",
"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
#ifdef __MMX__
, "mm0"
#endif
);
/* *INDENT-ON* */
}
/*
 * _memset_u8_mmx:
 * @dest: destination buffer
 * @val: fill value; only the low 8 bits are used
 * @count: number of bytes to fill
 *
 * Fills @count bytes at @dest with the low byte of @val using x86 inline
 * assembly.
 *
 * The first (@count % 8) bytes are written one at a time; the remaining
 * bytes are written eight at a time from the MMX register mm0, which is
 * loaded with the fill byte replicated into all eight lanes. `emms` is
 * executed on every path before the asm statement ends.
 */
static inline void
_memset_u8_mmx (guint8 * dest, guint val, guint count)
{
  guint8 val8 = val;
  guint64 val64;

  /* Build the 8-byte pattern from the truncated byte, not from the raw
   * `val`: bits above bit 7 of `val` would otherwise leak into neighbouring
   * lanes and make the quadword path disagree with the byte path. */
  val64 = val8;
  val64 |= val64 << 8;
  val64 |= val64 << 16;
  val64 |= val64 << 32;

  /* *INDENT-OFF* */
  __asm__ __volatile__ (
      "1: \n\t"
      /* Skip the byte loop when count is already a multiple of 8. */
      "test $7, %0 \n\t"
      "je 3f \n\t"
      /* Byte loop: store single bytes until count is a multiple of 8. */
      "2: \n\t"
      "movb %4, (%1) \n\t"
      "inc %1 \n\t"
      "dec %0 \n\t"
      "test $7, %0 \n\t"
      "jne 2b \n\t"
      /* Convert the remaining byte count into a count of 8-byte chunks. */
      "3: \n\t"
      "sar $3, %0 \n\t"
      "cmp $0, %0 \n\t"
      "je 5f \n\t"
      /* Quadword loop: store the replicated pattern 8 bytes at a time. */
      "movq %5, %%mm0 \n\t"
      "4: \n\t"
      "movq %%mm0, (%1) \n\t"
      "add $8, %1 \n\t"
      "dec %0 \n\t"
      "jne 4b \n\t"
      "5: \n\t"
      "emms \n\t"
      : "=r" (count), "=q" (dest)
      : "0" (count), "1" (dest), "q" (val8), "m" (val64)
      : "memory",
        "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
#ifdef __MMX__
      , "mm0"
#endif
  );
  /* *INDENT-ON* */
}
/*
 * _memset_u32_mmx:
 * @dest: destination buffer of 32-bit elements
 * @val: 32-bit value to store in every element
 * @count: number of 32-bit elements to fill
 *
 * Fills @count 32-bit elements at @dest with @val using x86 inline assembly.
 *
 * If @count is odd, one element is first written with a 32-bit store; the
 * remaining elements are written two at a time from the MMX register mm0,
 * which is loaded from a 64-bit value holding @val in both halves. `emms`
 * is executed on every path before the asm statement ends.
 */
static inline void
_memset_u32_mmx (guint32 * dest, guint32 val, guint count)
{
/* Replicate the 32-bit value into both halves of a 64-bit pattern. */
guint64 val64 = val;
val64 |= (val64 << 32);
/* *INDENT-OFF* */
__asm__ __volatile__ (
"1: \n\t"
/* Skip the single-element store when count is even. */
"test $1, %0 \n\t"
"je 3f \n\t"
/* Store one element so that the remaining count becomes even. */
"2: \n\t"
"movl %4, (%1) \n\t"
"add $4, %1 \n\t"
"dec %0 \n\t"
"test $1, %0 \n\t"
"jne 2b \n\t"
/* Convert the remaining element count into a count of element pairs. */
"3: \n\t"
"sar $1, %0 \n\t"
"cmp $0, %0 \n\t"
"je 5f \n\t"
/* Pair loop: store two elements (8 bytes) per iteration from mm0. */
"movq %5, %%mm0 \n\t"
"4: \n\t"
"movq %%mm0, (%1) \n\t"
"add $8, %1 \n\t"
"dec %0 \n\t"
"jne 4b \n\t"
"5: \n\t"
"emms \n\t"
: "=r" (count), "=r" (dest)
: "0" (count), "1" (dest), "r" (val), "m" (val64)
: "memory",
"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
#ifdef __MMX__
, "mm0"
#endif
);
/* *INDENT-ON* */
}
/*
 * _blend_u8_mmx:
 * @dest: destination plane, blended in place
 * @src: source plane
 * @src_stride: distance in bytes between the starts of two source rows
 * @dest_stride: distance in bytes between the starts of two destination rows
 * @src_width: number of bytes to blend per row
 * @src_height: number of rows to blend
 * @dest_width: unused by this function
 * @s_alpha: blend factor applied to the source
 *           (presumably in the range 0..256 -- confirm against callers)
 *
 * Blends @src over @dest, byte by byte, as
 *   dest = dest + (s_alpha * (src - dest)) / 256
 *
 * For each row, the leading (width % 4) bytes are blended in C through the
 * BLEND() macro; the rest of the row is blended with MMX, first one 4-byte
 * group if the remaining width is not a multiple of 8, then 8 bytes per
 * iteration.
 *
 * The per-row width is kept in a local copy. Decrementing @src_width itself
 * in the leading-bytes loop would round it down to a multiple of 4 for every
 * row after the first, so those rows would skip their leading bytes and the
 * pointers would drift from the stride.
 */
static inline void
_blend_u8_mmx (guint8 * dest, const guint8 * src,
    gint src_stride, gint dest_stride, gint src_width, gint src_height,
    gint dest_width, gint s_alpha)
{
  gint i;
  /* Padding to skip after each row; based on the full row width. */
  gint src_add = src_stride - src_width;
  gint dest_add = dest_stride - src_width;

  for (i = 0; i < src_height; i++) {
    /* Bytes of this row still to be blended. */
    gint row_width = src_width;

    /* Do up to 3 leading "odd" bytes so the rest is a multiple of 4 */
    while ((row_width & 0x03)) {
      *dest = BLEND (*dest, *src, s_alpha);
      dest++;
      src++;
      row_width--;
    }

    /* (P1 * (256 - A) + (P2 * A)) / 256
     * => (P1 * 256 - P1 * A + P2 * A) / 256
     * => (P1 * 256 + A * (P2 - P1)) / 256
     * => P1 + (A * (P2 - P1)) / 256
     */
    /* *INDENT-OFF* */
    __asm__ __volatile__ (
        " mov %4 , %%eax \n\t" /* eax = s_alpha */
        " movd %%eax , %%mm6 \n\t" /* mm6 = s_alpha */
        " punpcklwd %%mm6 , %%mm6 \n\t" /* mm6 = 00 00 00 00 00 aa 00 aa, alpha scale factor */
        " punpckldq %%mm6 , %%mm6 \n\t" /* mm6 = 00 aa 00 aa 00 aa 00 aa */
        " pxor %%mm7 , %%mm7 \n\t" /* mm7 = 00 00 00 00 00 00 00 00 */
        " movl %5 , %%ecx \n\t" /* ecx = remaining row width */
        "1: \n\t"
        " test $7 , %%ecx \n\t"
        " je 2f \n\t"
        /* do first 4 "odd" bytes */
        " movd (%2) , %%mm2 \n\t" /* mm2 = src, 00 00 00 00 sv su sy sa */
        " movd (%3) , %%mm1 \n\t" /* mm1 = dest, 00 00 00 00 dv du dy da */
        " punpcklbw %%mm7 , %%mm2 \n\t"
        " punpcklbw %%mm7 , %%mm1 \n\t"
        " psubw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 - mm1 */
        " pmullw %%mm6 , %%mm2 \n\t" /* mm2 = a * mm2 */
        " psllw $8 , %%mm1 \n\t" /* scale up */
        " paddw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 + mm1 */
        " psrlw $8 , %%mm2 \n\t" /* scale down */
        " packuswb %%mm2 , %%mm2 \n\t"
        " movd %%mm2 , (%3) \n\t" /* dest = mm2 */
        " add $4 , %1 \n\t"
        " add $4 , %0 \n\t"
        "2: \n\t"
        " sar $3 , %%ecx \n\t" /* prepare for 8 bytes per loop */
        " cmp $0 , %%ecx \n\t"
        " je 4f \n\t"
        "3: \n\t"
        /* do even pixels */
        " movq (%2) , %%mm2 \n\t" /* mm2 = src, sv1 su1 sy1 sa1 sv0 su0 sy0 sa0 */
        " movq (%3) , %%mm1 \n\t" /* mm1 = dest, dv1 du1 dy1 da1 dv0 du0 dy0 da0 */
        " movq %%mm2 , %%mm4 \n\t"
        " movq %%mm1 , %%mm3 \n\t"
        " punpcklbw %%mm7 , %%mm2 \n\t"
        " punpckhbw %%mm7 , %%mm4 \n\t"
        " punpcklbw %%mm7 , %%mm1 \n\t"
        " punpckhbw %%mm7 , %%mm3 \n\t"
        " psubw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 - mm1 */
        " psubw %%mm3 , %%mm4 \n\t" /* mm4 = mm4 - mm3 */
        " pmullw %%mm6 , %%mm2 \n\t" /* mm2 = a * mm2 */
        " pmullw %%mm6 , %%mm4 \n\t" /* mm4 = a * mm4 */
        " psllw $8 , %%mm1 \n\t" /* scale up */
        " psllw $8 , %%mm3 \n\t" /* scale up */
        " paddw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 + mm1 */
        " paddw %%mm3 , %%mm4 \n\t" /* mm4 = mm4 + mm3 */
        " psrlw $8 , %%mm2 \n\t" /* scale down */
        " psrlw $8 , %%mm4 \n\t" /* scale down */
        " packuswb %%mm4 , %%mm2 \n\t"
        " movq %%mm2 , (%3) \n\t"
        " add $8 , %0 \n\t"
        " add $8 , %1 \n\t"
        " dec %%ecx \n\t"
        " jne 3b \n\t"
        "4: \n\t"
        /* %0/%2 are the same register (src), as are %1/%3 (dest), because
         * the inputs use matching constraints. */
        :"=r" (src), "=r" (dest)
        :"0" (src), "1" (dest), "m" (s_alpha), "m" (row_width)
        :"%eax", "%ecx", "memory",
         "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
#ifdef __MMX__
        , "mm1", "mm2", "mm3", "mm4", "mm6", "mm7"
#endif
    );
    /* *INDENT-ON* */

    /* Skip the row padding to reach the start of the next row. */
    src += src_add;
    dest += dest_add;
  }

  /* Leave MMX state once all rows are done. */
  __asm__ __volatile__ ("emms");
}
#endif
This diff is collapsed.
/* autogenerated from blendorc.orc */
/* Declarations for the Orc functions used by the videomixer blending code.
 * This header is generated; change blendorc.orc rather than editing it. */
#ifndef _BLENDORC_H_
#define _BLENDORC_H_
#include <glib.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Fixed-width integer typedefs used by Orc; the guard keeps them from being
 * defined twice when several generated Orc headers are included. */
#ifndef _ORC_INTEGER_TYPEDEFS_
#define _ORC_INTEGER_TYPEDEFS_
/* C99 or later: take the types from <stdint.h>. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#include <stdint.h>
typedef int8_t orc_int8;
typedef int16_t orc_int16;
typedef int32_t orc_int32;
typedef int64_t orc_int64;
typedef uint8_t orc_uint8;
typedef uint16_t orc_uint16;
typedef uint32_t orc_uint32;
typedef uint64_t orc_uint64;
/* MSVC: use the compiler's sized integer keywords. */
#elif defined(_MSC_VER)
typedef signed __int8 orc_int8;
typedef signed __int16 orc_int16;
typedef signed __int32 orc_int32;
typedef signed __int64 orc_int64;
typedef unsigned __int8 orc_uint8;
typedef unsigned __int16 orc_uint16;
typedef unsigned __int32 orc_uint32;
typedef unsigned __int64 orc_uint64;
/* Fallback: map onto the basic C integer types. */
#else
#include <limits.h>
typedef signed char orc_int8;
typedef short orc_int16;
typedef int orc_int32;
typedef unsigned char orc_uint8;
typedef unsigned short orc_uint16;
typedef unsigned int orc_uint32;
/* When long is no wider than int, long long is used for the 64-bit types. */
#if INT_MAX == LONG_MAX
typedef long long orc_int64;
typedef unsigned long long orc_uint64;
#else
typedef long orc_int64;
typedef unsigned long orc_uint64;
#endif
#endif
/* Unions giving integer and floating-point views of the same storage. */
typedef union { orc_int32 i; float f; } orc_union32;
typedef union { orc_int64 i; double f; } orc_union64;
#endif
/* Stores the value p1 into the 32-bit elements of d1.
 * n is presumably the number of elements to process -- confirm against the
 * Orc documentation. */
void orc_splat_u32 (guint32 * d1, int p1, int n);
/* Copies 32-bit elements from s1 to d1.
 * n is presumably the number of elements to process -- confirm against the
 * Orc documentation. */
void orc_memcpy_u32 (guint32 * d1, const guint32 * s1, int n);
/* Blends the bytes of s1 into d1 using the factor p1; declared as a 2-D
 * function in blendorc.orc, hence the per-array strides.
 * n and m are presumably the row length and the number of rows -- confirm
 * against the Orc documentation. */
void orc_blend_u8 (guint8 * d1, int d1_stride, const guint8 * s1, int s1_stride, int p1, int n, int m);
#ifdef __cplusplus
}
#endif
#endif
# orc_splat_u32: store the 32-bit parameter p1 into each element of d1.
.function orc_splat_u32
.dest 4 d1 guint32
.param 4 p1 guint32
copyl d1, p1
# orc_memcpy_u32: copy each 32-bit element of s1 to the matching element of d1.
.function orc_memcpy_u32
.dest 4 d1 guint32
.source 4 s1 guint32
copyl d1, s1
# orc_blend_u8: 2-D blend of the bytes of s1 into d1 with factor p1.
# Per byte, using 16-bit temporaries:
#   d1 = ((d1 << 8) + p1 * (s1 - d1)) >> 8
# p1 is presumably a blend factor in the range 0..256 -- confirm against callers.
.function orc_blend_u8
.flags 2d
.dest 1 d1 guint8
.source 1 s1 guint8
.param 2 p1
.temp 2 t1
.temp 2 t2
.const 1 c1 8
# Widen destination and source bytes to 16-bit words.
convubw t1, d1
convubw t2, s1
# t2 = p1 * (s1 - d1)
subw t2, t2, t1
mullw t2, t2, p1
# t1 = d1 << 8, so the sum below is d1 * 256 + p1 * (s1 - d1)
shlw t1, t1, c1
addw t2, t1, t2
# Scale back down by 256 and narrow to a byte.
shruw t2, t2, c1
convsuswb d1, t2
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment