1 From 04c686998b70b616d6c3a13cc694cf0811ba8650 Mon Sep 17 00:00:00 2001
2 From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
3 Date: Tue, 14 Jul 2009 21:50:18 +0300
4 Subject: [PATCH] ARM NEON optimized version of uyvy422_to_yuv420p function
7 gst/ffmpegcolorspace/imgconvert.c | 153 +++++++++++++++++++++++++++++++++++++
8 1 files changed, 153 insertions(+), 0 deletions(-)
10 diff --git a/gst/ffmpegcolorspace/imgconvert.c b/gst/ffmpegcolorspace/imgconvert.c
11 index 2436a20..67cb19d 100644
12 --- a/gst/ffmpegcolorspace/imgconvert.c
13 +++ b/gst/ffmpegcolorspace/imgconvert.c
14 @@ -956,6 +956,158 @@ uyvy422_to_rgb24_neon (AVPicture * dst, const AVPicture * src,
19 + * ARM NEON optimized implementation of UYVY -> YUV420P converter
22 +uyvy422_to_yuv420p_neon (AVPicture * dst, const AVPicture * src,
23 + int width, int height) /* converts a width x height packed UYVY picture (src) into separate luma, Cb and Cr planes (dst) */
26 + (".macro convert_macroblock_uyvy422_to_yuv420p size, store_c, src, lum, cb, cr\n" /* assembler macro handling one group of pixels; size must be 16, 8, 4, 2 or 1, anything else hits .error */
27 + /* load up to 16 source pixels in UYVY format */
28 + ".if \\size == 16\n" " vld1.8 {d0, d1, d2, d3}, [\\src]!\n" " pld [\\src, #256]\n" ".elseif \\size == 8\n" " vld1.8 {d0, d1}, [\\src]!\n" ".elseif \\size == 4\n" " vld1.8 {d0}, [\\src]!\n" ".elseif \\size == 2\n" " vld1.8 {d0[0]}, [\\src]!\n" " vld1.8 {d0[1]}, [\\src]!\n" " vld1.8 {d0[2]}, [\\src]!\n" " vld1.8 {d0[3]}, [\\src]!\n" ".elseif \\size == 1\n" " vld1.8 {d0[0]}, [\\src]!\n" " vld1.8 {d0[1]}, [\\src]!\n" " vld1.8 {d0[2]}, [\\src]!\n" ".else\n" " .error \"unsupported macroblock size\"\n" ".endif\n" " vuzp.8 d0, d1\n" /* d1 - separated Y (first 8 bytes) */
29 + " vuzp.8 d2, d3\n" /* d3 - separated Y (next 8 bytes) */
30 + " vuzp.8 d0, d2\n" /* d0 - separated U, d2 - separated V */
31 + " vswp d1, d2\n" /* exchange d1 and d2 */ /* from here on: d0 = U (Cb), d1 = V (Cr), d2/d3 = Y */
32 + ".if \\size == 16\n"
33 + " vst1.8 {d2, d3}, [\\lum]!\n" /* 16 luma bytes, lum pointer post-incremented */
35 + " vst1.8 {d0}, [\\cb]!\n" /* 8 Cb bytes */
36 + " vst1.8 {d1}, [\\cr]!\n" /* 8 Cr bytes */
38 + ".elseif \\size == 8\n"
39 + " vst1.8 {d2}, [\\lum]!\n" /* 8 luma bytes */
41 + " vst1.8 {d0[0]}, [\\cb]!\n" /* 4 Cb bytes, written one lane at a time */
42 + " vst1.8 {d0[1]}, [\\cb]!\n"
43 + " vst1.8 {d0[2]}, [\\cb]!\n"
44 + " vst1.8 {d0[3]}, [\\cb]!\n"
45 + " vst1.8 {d1[0]}, [\\cr]!\n" /* 4 Cr bytes, written one lane at a time */
46 + " vst1.8 {d1[1]}, [\\cr]!\n"
47 + " vst1.8 {d1[2]}, [\\cr]!\n"
48 + " vst1.8 {d1[3]}, [\\cr]!\n"
50 + ".elseif \\size == 4\n"
51 + " vst1.8 {d2[0]}, [\\lum]!\n" /* 4 luma bytes */
52 + " vst1.8 {d2[1]}, [\\lum]!\n"
53 + " vst1.8 {d2[2]}, [\\lum]!\n"
54 + " vst1.8 {d2[3]}, [\\lum]!\n"
56 + " vst1.8 {d0[0]}, [\\cb]!\n" /* 2 Cb bytes */
57 + " vst1.8 {d0[1]}, [\\cb]!\n"
58 + " vst1.8 {d1[0]}, [\\cr]!\n" /* 2 Cr bytes */
59 + " vst1.8 {d1[1]}, [\\cr]!\n"
61 + ".elseif \\size == 2\n"
62 + " vst1.8 {d2[0]}, [\\lum]!\n" /* 2 luma bytes */
63 + " vst1.8 {d2[1]}, [\\lum]!\n"
65 + " vst1.8 {d0[0]}, [\\cb]!\n" /* 1 Cb byte */
66 + " vst1.8 {d1[0]}, [\\cr]!\n" /* 1 Cr byte */
68 + ".elseif \\size == 1\n"
69 + " vst1.8 {d2[0]}, [\\lum]!\n" /* single trailing pixel: 1 luma byte */
71 + " vst1.8 {d0[0]}, [\\cb]!\n" /* plus its Cb byte */
72 + " vst1.8 {d1[0]}, [\\cr]!\n" /* and its Cr byte */
75 + " .error \"unsupported macroblock size\"\n" ".endif\n" ".endm\n"); /* reject any other size at assembly time, then close the macro */
77 + const uint8_t *p, *p1; /* source cursors; p1 is the one advanced by src->linesize[0] per row below */
78 + uint8_t *lum, *cr, *cb, *lum1, *cr1, *cb1; /* destination cursors; the ...1 variants are advanced by dst->linesize[] per row below */
83 + lum1 = dst->data[0]; /* luma output starts at plane 0 of dst */
87 + for (; height >= 1; height -= 2) { /* each iteration accounts for two picture rows */
94 + asm volatile (" subs %[w], %[w], #16\n" /* first row pass; w -= 16 and set flags (branch and label lines are not visible in this excerpt) */
97 + " convert_macroblock_uyvy422_to_yuv420p 16, 1, %[p], %[lum], %[cb], %[cr]\n" /* 16-pixel group with store_c = 1; NOTE(review): the macro lines consuming store_c are not visible here, presumably they gate the chroma stores */
98 + " subs %[w], %[w], #16\n"
101 + " .irp size, 8, 4, 2, 1\n" /* tail handling: the enclosed lines are expanded once per listed size */
102 + " tst %[w], #\\size\n" /* test the bit of w matching this size */
104 + " convert_macroblock_uyvy422_to_yuv420p \\size, 1, %[p], %[lum], %[cb], %[cr]\n"
106 + " .endr\n":[w] "+&r" (w),[p] "+&r" (p),[lum] "+&r" (lum), /* all operands are read-write, early-clobber registers */
107 + [cb] "+&r" (cb),[cr] "+&r" (cr)
108 + ::"cc", "memory", "d0", "d1", "d2", "d3"); /* clobbers: condition flags, memory, NEON d0-d3 */
110 + for (; w >= 2; w -= 2) { /* NOTE(review): loop body is not visible in this excerpt */
128 + p1 += src->linesize[0]; /* step to the next source row */
129 + lum1 += dst->linesize[0]; /* step to the next luma row */
135 + asm volatile (" subs %[w], %[w], #16\n" /* second row pass: same structure as the first, but store_c = 0 */
138 + " convert_macroblock_uyvy422_to_yuv420p 16, 0, %[p], %[lum], %[cb], %[cr]\n"
139 + " subs %[w], %[w], #16\n"
142 + " .irp size, 8, 4, 2, 1\n"
143 + " tst %[w], #\\size\n"
145 + " convert_macroblock_uyvy422_to_yuv420p \\size, 0, %[p], %[lum], %[cb], %[cr]\n"
147 + " .endr\n":[w] "+&r" (w),[p] "+&r" (p),[lum] "+&r" (lum),
148 + [cb] "+&r" (cb),[cr] "+&r" (cr)
149 + ::"cc", "memory", "d0", "d1", "d2", "d3");
151 + for (w = width; w >= 2; w -= 2) { /* NOTE(review): loop body is not visible in this excerpt */
161 + p1 += src->linesize[0]; /* step to the next source row */
162 + lum1 += dst->linesize[0]; /* step to the next luma row */
164 + cb1 += dst->linesize[1]; /* in the visible lines the chroma row pointers advance once per outer iteration, luma twice */
165 + cr1 += dst->linesize[2];
167 + asm volatile (".purgem convert_macroblock_uyvy422_to_yuv420p\n"); /* undefine the assembler macro defined at the top of this function */
172 /* XXX: totally non optimized */
173 @@ -3311,6 +3463,7 @@ static ConvertEntry convert_table[] = {
176 {PIX_FMT_UYVY422, PIX_FMT_RGB24, uyvy422_to_rgb24_neon},
177 + {PIX_FMT_UYVY422, PIX_FMT_YUV420P, uyvy422_to_yuv420p_neon},
179 {PIX_FMT_UYVY422, PIX_FMT_YUV420P, uyvy422_to_yuv420p},
180 {PIX_FMT_UYVY422, PIX_FMT_YUV422P, uyvy422_to_yuv422p},