+void helper_vmsummbm (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
+{
+ int32_t prod[16];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(r->s8); i++) {
+ prod[i] = (int32_t)a->s8[i] * b->u8[i];
+ }
+
+ VECTOR_FOR_INORDER_I(i, s32) {
+ r->s32[i] = c->s32[i] + prod[4*i] + prod[4*i+1] + prod[4*i+2] + prod[4*i+3];
+ }
+}
+
+void helper_vmsumshm (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
+{
+ int32_t prod[8];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(r->s16); i++) {
+ prod[i] = a->s16[i] * b->s16[i];
+ }
+
+ VECTOR_FOR_INORDER_I(i, s32) {
+ r->s32[i] = c->s32[i] + prod[2*i] + prod[2*i+1];
+ }
+}
+
+void helper_vmsumshs (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
+{
+ int32_t prod[8];
+ int i;
+ int sat = 0;
+
+ for (i = 0; i < ARRAY_SIZE(r->s16); i++) {
+ prod[i] = (int32_t)a->s16[i] * b->s16[i];
+ }
+
+ VECTOR_FOR_INORDER_I (i, s32) {
+ int64_t t = (int64_t)c->s32[i] + prod[2*i] + prod[2*i+1];
+ r->u32[i] = cvtsdsw(t, &sat);
+ }
+
+ if (sat) {
+ env->vscr |= (1 << VSCR_SAT);
+ }
+}
+
+void helper_vmsumubm (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
+{
+ uint16_t prod[16];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(r->u8); i++) {
+ prod[i] = a->u8[i] * b->u8[i];
+ }
+
+ VECTOR_FOR_INORDER_I(i, u32) {
+ r->u32[i] = c->u32[i] + prod[4*i] + prod[4*i+1] + prod[4*i+2] + prod[4*i+3];
+ }
+}
+
+void helper_vmsumuhm (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
+{
+ uint32_t prod[8];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(r->u16); i++) {
+ prod[i] = a->u16[i] * b->u16[i];
+ }
+
+ VECTOR_FOR_INORDER_I(i, u32) {
+ r->u32[i] = c->u32[i] + prod[2*i] + prod[2*i+1];
+ }
+}
+
+void helper_vmsumuhs (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
+{
+ uint32_t prod[8];
+ int i;
+ int sat = 0;
+
+ for (i = 0; i < ARRAY_SIZE(r->u16); i++) {
+ prod[i] = a->u16[i] * b->u16[i];
+ }
+
+ VECTOR_FOR_INORDER_I (i, s32) {
+ uint64_t t = (uint64_t)c->u32[i] + prod[2*i] + prod[2*i+1];
+ r->u32[i] = cvtuduw(t, &sat);
+ }
+
+ if (sat) {
+ env->vscr |= (1 << VSCR_SAT);
+ }
+}
+
+#define VMUL_DO(name, mul_element, prod_element, evenp) \
+ void helper_v##name (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
+ { \
+ int i; \
+ VECTOR_FOR_INORDER_I(i, prod_element) { \
+ if (evenp) { \
+ r->prod_element[i] = a->mul_element[i*2+HI_IDX] * b->mul_element[i*2+HI_IDX]; \
+ } else { \
+ r->prod_element[i] = a->mul_element[i*2+LO_IDX] * b->mul_element[i*2+LO_IDX]; \
+ } \
+ } \
+ }
+#define VMUL(suffix, mul_element, prod_element) \
+ VMUL_DO(mule##suffix, mul_element, prod_element, 1) \
+ VMUL_DO(mulo##suffix, mul_element, prod_element, 0)
+VMUL(sb, s8, s16)
+VMUL(sh, s16, s32)
+VMUL(ub, u8, u16)
+VMUL(uh, u16, u32)
+#undef VMUL_DO
+#undef VMUL
+
+void helper_vnmsubfp (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(r->f); i++) {
+ HANDLE_NAN3(r->f[i], a->f[i], b->f[i], c->f[i]) {
+ /* Need to do the computation is higher precision and round
+ * once at the end. */
+ float64 af, bf, cf, t;
+ af = float32_to_float64(a->f[i], &env->vec_status);
+ bf = float32_to_float64(b->f[i], &env->vec_status);
+ cf = float32_to_float64(c->f[i], &env->vec_status);
+ t = float64_mul(af, cf, &env->vec_status);
+ t = float64_sub(t, bf, &env->vec_status);
+ t = float64_chs(t);
+ r->f[i] = float64_to_float32(t, &env->vec_status);
+ }
+ }
+}
+
+void helper_vperm (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
+{
+ ppc_avr_t result;
+ int i;
+ VECTOR_FOR_INORDER_I (i, u8) {
+ int s = c->u8[i] & 0x1f;
+#if defined(WORDS_BIGENDIAN)
+ int index = s & 0xf;
+#else
+ int index = 15 - (s & 0xf);
+#endif
+ if (s & 0x10) {
+ result.u8[i] = b->u8[index];
+ } else {
+ result.u8[i] = a->u8[index];
+ }
+ }
+ *r = result;
+}
+
+#if defined(WORDS_BIGENDIAN)
+#define PKBIG 1
+#else
+#define PKBIG 0
+#endif
+void helper_vpkpx (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+ int i, j;
+ ppc_avr_t result;
+#if defined(WORDS_BIGENDIAN)
+ const ppc_avr_t *x[2] = { a, b };
+#else
+ const ppc_avr_t *x[2] = { b, a };
+#endif
+
+ VECTOR_FOR_INORDER_I (i, u64) {
+ VECTOR_FOR_INORDER_I (j, u32){
+ uint32_t e = x[i]->u32[j];
+ result.u16[4*i+j] = (((e >> 9) & 0xfc00) |
+ ((e >> 6) & 0x3e0) |
+ ((e >> 3) & 0x1f));
+ }
+ }
+ *r = result;
+}
+
+#define VPK(suffix, from, to, cvt, dosat) \
+ void helper_vpk##suffix (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
+ { \
+ int i; \
+ int sat = 0; \
+ ppc_avr_t result; \
+ ppc_avr_t *a0 = PKBIG ? a : b; \
+ ppc_avr_t *a1 = PKBIG ? b : a; \
+ VECTOR_FOR_INORDER_I (i, from) { \
+ result.to[i] = cvt(a0->from[i], &sat); \
+ result.to[i+ARRAY_SIZE(r->from)] = cvt(a1->from[i], &sat); \
+ } \
+ *r = result; \
+ if (dosat && sat) { \
+ env->vscr |= (1 << VSCR_SAT); \
+ } \
+ }
+#define I(x, y) (x)
+VPK(shss, s16, s8, cvtshsb, 1)
+VPK(shus, s16, u8, cvtshub, 1)
+VPK(swss, s32, s16, cvtswsh, 1)
+VPK(swus, s32, u16, cvtswuh, 1)
+VPK(uhus, u16, u8, cvtuhub, 1)
+VPK(uwus, u32, u16, cvtuwuh, 1)
+VPK(uhum, u16, u8, I, 0)
+VPK(uwum, u32, u16, I, 0)
+#undef I
+#undef VPK
+#undef PKBIG
+
+void helper_vrefp (ppc_avr_t *r, ppc_avr_t *b)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(r->f); i++) {
+ HANDLE_NAN1(r->f[i], b->f[i]) {
+ r->f[i] = float32_div(float32_one, b->f[i], &env->vec_status);
+ }
+ }
+}
+
+#define VRFI(suffix, rounding) \
+ void helper_vrfi##suffix (ppc_avr_t *r, ppc_avr_t *b) \
+ { \
+ int i; \
+ float_status s = env->vec_status; \
+ set_float_rounding_mode(rounding, &s); \
+ for (i = 0; i < ARRAY_SIZE(r->f); i++) { \
+ HANDLE_NAN1(r->f[i], b->f[i]) { \
+ r->f[i] = float32_round_to_int (b->f[i], &s); \
+ } \
+ } \
+ }
+VRFI(n, float_round_nearest_even)
+VRFI(m, float_round_down)
+VRFI(p, float_round_up)
+VRFI(z, float_round_to_zero)
+#undef VRFI
+
+#define VROTATE(suffix, element) \
+ void helper_vrl##suffix (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
+ { \
+ int i; \
+ for (i = 0; i < ARRAY_SIZE(r->element); i++) { \
+ unsigned int mask = ((1 << (3 + (sizeof (a->element[0]) >> 1))) - 1); \
+ unsigned int shift = b->element[i] & mask; \
+ r->element[i] = (a->element[i] << shift) | (a->element[i] >> (sizeof(a->element[0]) * 8 - shift)); \
+ } \
+ }
+VROTATE(b, u8)
+VROTATE(h, u16)
+VROTATE(w, u32)
+#undef VROTATE
+
+void helper_vrsqrtefp (ppc_avr_t *r, ppc_avr_t *b)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(r->f); i++) {
+ HANDLE_NAN1(r->f[i], b->f[i]) {
+ float32 t = float32_sqrt(b->f[i], &env->vec_status);
+ r->f[i] = float32_div(float32_one, t, &env->vec_status);
+ }
+ }
+}
+
+void helper_vsel (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, ppc_avr_t *c)
+{
+ r->u64[0] = (a->u64[0] & ~c->u64[0]) | (b->u64[0] & c->u64[0]);
+ r->u64[1] = (a->u64[1] & ~c->u64[1]) | (b->u64[1] & c->u64[1]);
+}
+
+void helper_vlogefp (ppc_avr_t *r, ppc_avr_t *b)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(r->f); i++) {
+ HANDLE_NAN1(r->f[i], b->f[i]) {
+ r->f[i] = float32_log2(b->f[i], &env->vec_status);
+ }
+ }
+}
+
+#if defined(WORDS_BIGENDIAN)
+#define LEFT 0
+#define RIGHT 1
+#else
+#define LEFT 1
+#define RIGHT 0
+#endif
+/* The specification says that the results are undefined if all of the
+ * shift counts are not identical. We check to make sure that they are
+ * to conform to what real hardware appears to do. */
+#define VSHIFT(suffix, leftp) \
+ void helper_vs##suffix (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
+ { \
+ int shift = b->u8[LO_IDX*15] & 0x7; \
+ int doit = 1; \
+ int i; \
+ for (i = 0; i < ARRAY_SIZE(r->u8); i++) { \
+ doit = doit && ((b->u8[i] & 0x7) == shift); \
+ } \
+ if (doit) { \
+ if (shift == 0) { \
+ *r = *a; \
+ } else if (leftp) { \
+ uint64_t carry = a->u64[LO_IDX] >> (64 - shift); \
+ r->u64[HI_IDX] = (a->u64[HI_IDX] << shift) | carry; \
+ r->u64[LO_IDX] = a->u64[LO_IDX] << shift; \
+ } else { \
+ uint64_t carry = a->u64[HI_IDX] << (64 - shift); \
+ r->u64[LO_IDX] = (a->u64[LO_IDX] >> shift) | carry; \
+ r->u64[HI_IDX] = a->u64[HI_IDX] >> shift; \
+ } \
+ } \
+ }
+VSHIFT(l, LEFT)
+VSHIFT(r, RIGHT)
+#undef VSHIFT
+#undef LEFT
+#undef RIGHT
+
+#define VSL(suffix, element) \
+ void helper_vsl##suffix (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
+ { \
+ int i; \
+ for (i = 0; i < ARRAY_SIZE(r->element); i++) { \
+ unsigned int mask = ((1 << (3 + (sizeof (a->element[0]) >> 1))) - 1); \
+ unsigned int shift = b->element[i] & mask; \
+ r->element[i] = a->element[i] << shift; \
+ } \
+ }
+VSL(b, u8)
+VSL(h, u16)
+VSL(w, u32)
+#undef VSL
+
+void helper_vsldoi (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b, uint32_t shift)
+{
+ int sh = shift & 0xf;
+ int i;
+ ppc_avr_t result;
+
+#if defined(WORDS_BIGENDIAN)
+ for (i = 0; i < ARRAY_SIZE(r->u8); i++) {
+ int index = sh + i;
+ if (index > 0xf) {
+ result.u8[i] = b->u8[index-0x10];
+ } else {
+ result.u8[i] = a->u8[index];
+ }
+ }
+#else
+ for (i = 0; i < ARRAY_SIZE(r->u8); i++) {
+ int index = (16 - sh) + i;
+ if (index > 0xf) {
+ result.u8[i] = a->u8[index-0x10];
+ } else {
+ result.u8[i] = b->u8[index];
+ }
+ }
+#endif
+ *r = result;
+}
+
+void helper_vslo (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+ int sh = (b->u8[LO_IDX*0xf] >> 3) & 0xf;
+
+#if defined (WORDS_BIGENDIAN)
+ memmove (&r->u8[0], &a->u8[sh], 16-sh);
+ memset (&r->u8[16-sh], 0, sh);
+#else
+ memmove (&r->u8[sh], &a->u8[0], 16-sh);
+ memset (&r->u8[0], 0, sh);
+#endif
+}
+
+/* Experimental testing shows that hardware masks the immediate. */
+#define _SPLAT_MASKED(element) (splat & (ARRAY_SIZE(r->element) - 1))
+#if defined(WORDS_BIGENDIAN)
+#define SPLAT_ELEMENT(element) _SPLAT_MASKED(element)
+#else
+#define SPLAT_ELEMENT(element) (ARRAY_SIZE(r->element)-1 - _SPLAT_MASKED(element))
+#endif
+#define VSPLT(suffix, element) \
+ void helper_vsplt##suffix (ppc_avr_t *r, ppc_avr_t *b, uint32_t splat) \
+ { \
+ uint32_t s = b->element[SPLAT_ELEMENT(element)]; \
+ int i; \
+ for (i = 0; i < ARRAY_SIZE(r->element); i++) { \
+ r->element[i] = s; \
+ } \
+ }
+VSPLT(b, u8)
+VSPLT(h, u16)
+VSPLT(w, u32)
+#undef VSPLT
+#undef SPLAT_ELEMENT
+#undef _SPLAT_MASKED
+
+#define VSPLTI(suffix, element, splat_type) \
+ void helper_vspltis##suffix (ppc_avr_t *r, uint32_t splat) \
+ { \
+ splat_type x = (int8_t)(splat << 3) >> 3; \
+ int i; \
+ for (i = 0; i < ARRAY_SIZE(r->element); i++) { \
+ r->element[i] = x; \
+ } \
+ }
+VSPLTI(b, s8, int8_t)
+VSPLTI(h, s16, int16_t)
+VSPLTI(w, s32, int32_t)
+#undef VSPLTI
+
+#define VSR(suffix, element) \
+ void helper_vsr##suffix (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
+ { \
+ int i; \
+ for (i = 0; i < ARRAY_SIZE(r->element); i++) { \
+ unsigned int mask = ((1 << (3 + (sizeof (a->element[0]) >> 1))) - 1); \
+ unsigned int shift = b->element[i] & mask; \
+ r->element[i] = a->element[i] >> shift; \
+ } \
+ }
+VSR(ab, s8)
+VSR(ah, s16)
+VSR(aw, s32)
+VSR(b, u8)
+VSR(h, u16)
+VSR(w, u32)
+#undef VSR
+
+void helper_vsro (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+ int sh = (b->u8[LO_IDX*0xf] >> 3) & 0xf;
+
+#if defined (WORDS_BIGENDIAN)
+ memmove (&r->u8[sh], &a->u8[0], 16-sh);
+ memset (&r->u8[0], 0, sh);
+#else
+ memmove (&r->u8[0], &a->u8[sh], 16-sh);
+ memset (&r->u8[16-sh], 0, sh);
+#endif
+}
+
+void helper_vsubcuw (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(r->u32); i++) {
+ r->u32[i] = a->u32[i] >= b->u32[i];
+ }
+}
+
+void helper_vsumsws (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+ int64_t t;
+ int i, upper;
+ ppc_avr_t result;
+ int sat = 0;
+
+#if defined(WORDS_BIGENDIAN)
+ upper = ARRAY_SIZE(r->s32)-1;
+#else
+ upper = 0;
+#endif
+ t = (int64_t)b->s32[upper];
+ for (i = 0; i < ARRAY_SIZE(r->s32); i++) {
+ t += a->s32[i];
+ result.s32[i] = 0;
+ }
+ result.s32[upper] = cvtsdsw(t, &sat);
+ *r = result;
+
+ if (sat) {
+ env->vscr |= (1 << VSCR_SAT);
+ }
+}
+
+void helper_vsum2sws (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+ int i, j, upper;
+ ppc_avr_t result;
+ int sat = 0;
+
+#if defined(WORDS_BIGENDIAN)
+ upper = 1;
+#else
+ upper = 0;
+#endif
+ for (i = 0; i < ARRAY_SIZE(r->u64); i++) {
+ int64_t t = (int64_t)b->s32[upper+i*2];
+ result.u64[i] = 0;
+ for (j = 0; j < ARRAY_SIZE(r->u64); j++) {
+ t += a->s32[2*i+j];
+ }
+ result.s32[upper+i*2] = cvtsdsw(t, &sat);
+ }
+
+ *r = result;
+ if (sat) {
+ env->vscr |= (1 << VSCR_SAT);
+ }
+}
+
+void helper_vsum4sbs (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+ int i, j;
+ int sat = 0;
+
+ for (i = 0; i < ARRAY_SIZE(r->s32); i++) {
+ int64_t t = (int64_t)b->s32[i];
+ for (j = 0; j < ARRAY_SIZE(r->s32); j++) {
+ t += a->s8[4*i+j];
+ }
+ r->s32[i] = cvtsdsw(t, &sat);
+ }
+
+ if (sat) {
+ env->vscr |= (1 << VSCR_SAT);
+ }
+}
+
+void helper_vsum4shs (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+ int sat = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(r->s32); i++) {
+ int64_t t = (int64_t)b->s32[i];
+ t += a->s16[2*i] + a->s16[2*i+1];
+ r->s32[i] = cvtsdsw(t, &sat);
+ }
+
+ if (sat) {
+ env->vscr |= (1 << VSCR_SAT);
+ }
+}
+
+void helper_vsum4ubs (ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+ int i, j;
+ int sat = 0;
+
+ for (i = 0; i < ARRAY_SIZE(r->u32); i++) {
+ uint64_t t = (uint64_t)b->u32[i];
+ for (j = 0; j < ARRAY_SIZE(r->u32); j++) {
+ t += a->u8[4*i+j];
+ }
+ r->u32[i] = cvtuduw(t, &sat);
+ }
+
+ if (sat) {
+ env->vscr |= (1 << VSCR_SAT);
+ }
+}
+
+#if defined(WORDS_BIGENDIAN)
+#define UPKHI 1
+#define UPKLO 0
+#else
+#define UPKHI 0
+#define UPKLO 1
+#endif
+#define VUPKPX(suffix, hi) \
+ void helper_vupk##suffix (ppc_avr_t *r, ppc_avr_t *b) \
+ { \
+ int i; \
+ ppc_avr_t result; \
+ for (i = 0; i < ARRAY_SIZE(r->u32); i++) { \
+ uint16_t e = b->u16[hi ? i : i+4]; \
+ uint8_t a = (e >> 15) ? 0xff : 0; \
+ uint8_t r = (e >> 10) & 0x1f; \
+ uint8_t g = (e >> 5) & 0x1f; \
+ uint8_t b = e & 0x1f; \
+ result.u32[i] = (a << 24) | (r << 16) | (g << 8) | b; \
+ } \
+ *r = result; \
+ }
+VUPKPX(lpx, UPKLO)
+VUPKPX(hpx, UPKHI)
+#undef VUPKPX
+
+#define VUPK(suffix, unpacked, packee, hi) \
+ void helper_vupk##suffix (ppc_avr_t *r, ppc_avr_t *b) \
+ { \
+ int i; \
+ ppc_avr_t result; \
+ if (hi) { \
+ for (i = 0; i < ARRAY_SIZE(r->unpacked); i++) { \
+ result.unpacked[i] = b->packee[i]; \
+ } \
+ } else { \
+ for (i = ARRAY_SIZE(r->unpacked); i < ARRAY_SIZE(r->packee); i++) { \
+ result.unpacked[i-ARRAY_SIZE(r->unpacked)] = b->packee[i]; \
+ } \
+ } \
+ *r = result; \
+ }
+VUPK(hsb, s16, s8, UPKHI)
+VUPK(hsh, s32, s16, UPKHI)
+VUPK(lsb, s16, s8, UPKLO)
+VUPK(lsh, s32, s16, UPKLO)
+#undef VUPK
+#undef UPKHI
+#undef UPKLO
+
+#undef DO_HANDLE_NAN
+#undef HANDLE_NAN1
+#undef HANDLE_NAN2
+#undef HANDLE_NAN3