Added gst-plugins-base-subtitles0.10-0.10.34 for Meego Harmattan 1.2
[mafwsubrenderer] / gst-plugins-base-subtitles0.10 / debian / patches / 0015-ARM-NEON-optimized-version-of-uyvy422_to_yuv420p-fun.patch
1 From 04c686998b70b616d6c3a13cc694cf0811ba8650 Mon Sep 17 00:00:00 2001
2 From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
3 Date: Tue, 14 Jul 2009 21:50:18 +0300
4 Subject: [PATCH] ARM NEON optimized version of uyvy422_to_yuv420p function
5
6 ---
7  gst/ffmpegcolorspace/imgconvert.c |  153 +++++++++++++++++++++++++++++++++++++
8  1 files changed, 153 insertions(+), 0 deletions(-)
9
10 diff --git a/gst/ffmpegcolorspace/imgconvert.c b/gst/ffmpegcolorspace/imgconvert.c
11 index 2436a20..67cb19d 100644
12 --- a/gst/ffmpegcolorspace/imgconvert.c
13 +++ b/gst/ffmpegcolorspace/imgconvert.c
14 @@ -956,6 +956,158 @@ uyvy422_to_rgb24_neon (AVPicture * dst, const AVPicture * src,
15    }
16  }
17  
18 +/*
19 + * ARM NEON optimized implementation of the UYVY422 -> YUV420P converter: every source row contributes luma to dst->data[0]; only the first row of each row pair contributes chroma (Cb to dst->data[1], Cr to dst->data[2]).
20 + */
21 +static void
22 +uyvy422_to_yuv420p_neon (AVPicture * dst, const AVPicture * src,
23 +    int width, int height)
24 +{
25 +  asm volatile      /* defines a GNU assembler macro; it is removed again by .purgem at the end of this function */
26 +      (".macro convert_macroblock_uyvy422_to_yuv420p size, store_c, src, lum, cb, cr\n"    /* size: pixels per invocation; store_c: nonzero to also store Cb/Cr */
27 +      /* load up to 16 source pixels in UYVY format */
28 +      ".if \\size == 16\n" "  vld1.8      {d0, d1, d2, d3}, [\\src]!\n" "  pld [\\src, #256]\n" ".elseif \\size == 8\n" "  vld1.8      {d0, d1}, [\\src]!\n" ".elseif \\size == 4\n" "  vld1.8      {d0}, [\\src]!\n" ".elseif \\size == 2\n" "  vld1.8      {d0[0]}, [\\src]!\n" "  vld1.8      {d0[1]}, [\\src]!\n" "  vld1.8      {d0[2]}, [\\src]!\n" "  vld1.8      {d0[3]}, [\\src]!\n" ".elseif \\size == 1\n" "  vld1.8      {d0[0]}, [\\src]!\n" "  vld1.8      {d0[1]}, [\\src]!\n" "  vld1.8      {d0[2]}, [\\src]!\n" ".else\n" "  .error \"unsupported macroblock size\"\n" ".endif\n" "  vuzp.8      d0, d1\n"    /* d1 - separated Y (first 8 bytes) */
29 +      "  vuzp.8      d2, d3\n"  /* d3 - separated Y (next 8 bytes) */
30 +      "  vuzp.8      d0, d2\n"  /* d0 - separated U, d2 - separated V */
31 +      "  vswp        d1, d2\n"  /* exchange d1 and d2, so that d2, d3 hold Y and d1 holds V */
32 +      ".if \\size == 16\n"
33 +      "  vst1.8      {d2, d3}, [\\lum]!\n"
34 +      ".if \\store_c\n"
35 +      "  vst1.8      {d0}, [\\cb]!\n"
36 +      "  vst1.8      {d1}, [\\cr]!\n"
37 +      ".endif\n"
38 +      ".elseif \\size == 8\n"
39 +      "  vst1.8      {d2}, [\\lum]!\n"
40 +      ".if \\store_c\n"
41 +      "  vst1.8      {d0[0]}, [\\cb]!\n"
42 +      "  vst1.8      {d0[1]}, [\\cb]!\n"
43 +      "  vst1.8      {d0[2]}, [\\cb]!\n"
44 +      "  vst1.8      {d0[3]}, [\\cb]!\n"
45 +      "  vst1.8      {d1[0]}, [\\cr]!\n"
46 +      "  vst1.8      {d1[1]}, [\\cr]!\n"
47 +      "  vst1.8      {d1[2]}, [\\cr]!\n"
48 +      "  vst1.8      {d1[3]}, [\\cr]!\n"
49 +      ".endif\n"
50 +      ".elseif \\size == 4\n"
51 +      "  vst1.8      {d2[0]}, [\\lum]!\n"
52 +      "  vst1.8      {d2[1]}, [\\lum]!\n"
53 +      "  vst1.8      {d2[2]}, [\\lum]!\n"
54 +      "  vst1.8      {d2[3]}, [\\lum]!\n"
55 +      ".if \\store_c\n"
56 +      "  vst1.8      {d0[0]}, [\\cb]!\n"
57 +      "  vst1.8      {d0[1]}, [\\cb]!\n"
58 +      "  vst1.8      {d1[0]}, [\\cr]!\n"
59 +      "  vst1.8      {d1[1]}, [\\cr]!\n"
60 +      ".endif\n"
61 +      ".elseif \\size == 2\n"
62 +      "  vst1.8      {d2[0]}, [\\lum]!\n"
63 +      "  vst1.8      {d2[1]}, [\\lum]!\n"
64 +      ".if \\store_c\n"
65 +      "  vst1.8      {d0[0]}, [\\cb]!\n"
66 +      "  vst1.8      {d1[0]}, [\\cr]!\n"
67 +      ".endif\n"
68 +      ".elseif \\size == 1\n"
69 +      "  vst1.8      {d2[0]}, [\\lum]!\n"
70 +      ".if \\store_c\n"
71 +      "  vst1.8      {d0[0]}, [\\cb]!\n"
72 +      "  vst1.8      {d1[0]}, [\\cr]!\n"
73 +      ".endif\n"
74 +      ".else\n"
75 +      "  .error \"unsupported macroblock size\"\n" ".endif\n" ".endm\n");
76 +
77 +  const uint8_t *p, *p1;
78 +  uint8_t *lum, *cr, *cb, *lum1, *cr1, *cb1;    /* the "1" variants point at the start of the current row; the others advance within it */
79 +  int w;
80 +
81 +  p1 = src->data[0];
82 +
83 +  lum1 = dst->data[0];
84 +  cb1 = dst->data[1];
85 +  cr1 = dst->data[2];
86 +
87 +  for (; height >= 1; height -= 2) {    /* up to two source rows are consumed per iteration */
88 +    p = p1;
89 +    lum = lum1;
90 +    cb = cb1;
91 +    cr = cr1;
92 +    w = width;
93 +#if 1    /* the NEON path is always compiled; the #else branch is a plain C version that is never built */
94 +    asm volatile ("   subs        %[w], %[w], #16\n"    /* first row of the pair: luma and chroma are stored */
95 +        "   blt         2f\n"    /* fewer than 16 pixels: skip the main loop */
96 +        "1:\n"
97 +        "   convert_macroblock_uyvy422_to_yuv420p 16, 1, %[p], %[lum], %[cb], %[cr]\n"
98 +        "   subs        %[w], %[w], #16\n"
99 +        "   bge         1b\n"
100 +        "2:\n"
101 +        "   .irp        size, 8, 4, 2, 1\n"    /* tail: w is negative here, but its low four bits still equal the number of pixels left */
102 +        "   tst         %[w], #\\size\n"
103 +        "   beq         3f\n"
104 +        "   convert_macroblock_uyvy422_to_yuv420p \\size, 1, %[p], %[lum], %[cb], %[cr]\n"
105 +        "3:\n"
106 +        "  .endr\n":[w] "+&r" (w),[p] "+&r" (p),[lum] "+&r" (lum),
107 +        [cb] "+&r" (cb),[cr] "+&r" (cr)
108 +        ::"cc", "memory", "d0", "d1", "d2", "d3");
109 +#else
110 +    for (; w >= 2; w -= 2) {
111 +      lum[0] = p[1];
112 +      cb[0] = p[0];
113 +      lum[1] = p[3];
114 +      cr[0] = p[2];
115 +      p += 4;
116 +      lum += 2;
117 +      cb++;
118 +      cr++;
119 +    }
120 +    if (w) {
121 +      lum[0] = p[1];
122 +      cb[0] = p[0];
123 +      cr[0] = p[2];
124 +      cb++;
125 +      cr++;
126 +    }
127 +#endif
128 +    p1 += src->linesize[0];
129 +    lum1 += dst->linesize[0];
130 +    if (height > 1) {    /* a second row exists (height may be odd); it contributes luma only */
131 +      p = p1;
132 +      lum = lum1;
133 +      w = width;
134 +#if 1
135 +      asm volatile ("   subs        %[w], %[w], #16\n"
136 +          "   blt         2f\n"
137 +          "1:\n"
138 +          "   convert_macroblock_uyvy422_to_yuv420p 16, 0, %[p], %[lum], %[cb], %[cr]\n"
139 +          "   subs        %[w], %[w], #16\n"
140 +          "   bge         1b\n"
141 +          "2:\n"
142 +          "   .irp        size, 8, 4, 2, 1\n"
143 +          "   tst         %[w], #\\size\n"
144 +          "   beq         3f\n"
145 +          "   convert_macroblock_uyvy422_to_yuv420p \\size, 0, %[p], %[lum], %[cb], %[cr]\n"
146 +          "3:\n"
147 +          "  .endr\n":[w] "+&r" (w),[p] "+&r" (p),[lum] "+&r" (lum),
148 +          [cb] "+&r" (cb),[cr] "+&r" (cr)    /* cb/cr are still passed as operands, but with store_c == 0 the macro emits no stores through them */
149 +          ::"cc", "memory", "d0", "d1", "d2", "d3");
150 +#else
151 +      for (w = width; w >= 2; w -= 2) {
152 +        lum[0] = p[1];
153 +        lum[1] = p[3];
154 +        p += 4;
155 +        lum += 2;
156 +      }
157 +      if (w) {
158 +        lum[0] = p[1];
159 +      }
160 +#endif
161 +      p1 += src->linesize[0];
162 +      lum1 += dst->linesize[0];
163 +    }
164 +    cb1 += dst->linesize[1];    /* one chroma row is produced per pair of luma rows */
165 +    cr1 += dst->linesize[2];
166 +  }
167 +  asm volatile (".purgem convert_macroblock_uyvy422_to_yuv420p\n");    /* remove the assembler macro defined at the top of this function */
168 +}
169 +
170  #endif
171  
172  /* XXX: totally non optimized */
173 @@ -3311,6 +3463,7 @@ static ConvertEntry convert_table[] = {
174  
175  #ifdef __ARM_NEON__
176    {PIX_FMT_UYVY422, PIX_FMT_RGB24, uyvy422_to_rgb24_neon},
177 +  {PIX_FMT_UYVY422, PIX_FMT_YUV420P, uyvy422_to_yuv420p_neon},
178  #endif
179    {PIX_FMT_UYVY422, PIX_FMT_YUV420P, uyvy422_to_yuv420p},
180    {PIX_FMT_UYVY422, PIX_FMT_YUV422P, uyvy422_to_yuv422p},