--- /dev/null
+#include "video_utils_p5p.h"
+#include <VLIB/video_picture_defines.h>
+
+#ifdef HAS_VIDEO_BLOCKLINE_TO_MACRO_BLOCKS
+
+#include "config-tcm.h"
+
+ .section ".text.itcm","ax"
+ .global video_blockline_to_macro_blocks
+ .global video_blockline_patch_block_1
+ .global video_blockline_patch_block_2_start
+ .global video_blockline_patch_block_2
+ .global video_blockline_patch_block_3_start
+ .global video_blockline_patch_block_3
+ .global video_blockline_patch_block_4_start
+ .global video_blockline_patch_block_4
+ .global video_blockline_patch_fix_y
+ .global video_blockline_patch_block_cb
+ .global video_blockline_patch_fix_cb
+ .global video_blockline_patch_block_cr
+ .global video_blockline_patch_fix_cr
+ .type video_blockline_to_macro_blocks, %function
+
+/* Registers usage
+ r0 : ctx
+ r1 : dst
+ r2 : num_macro_blocks
+ r3 : y_src
+ r4 : cb_src
+ r5 : cr_src
+ r6, r7, r8, r9 : Pixels in 16 bits format (write in dst)
+ r10, r11 : Pixels in 8 bits format (read from y_src, cb_src or cr_src)
+ ip/r12 : not used for instance
+ lr/r14 : line counter in internal loop (8 lines per block)
+*/
+
+video_blockline_to_macro_blocks:
+ stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ ldm r0, { r3, r4, r5 }
+
+video_blockline_to_macro_blocks_loop:
+
+ @@ Luminances
+ @@ Copy first block
+ mov lr, #8
+
+copy_block_1: @ data conversion from 8 bits to 16 bits
+
+ ldmia r3!, {r10-r11} @ Get 8 pixels from y_src
+
+ @ isolate the first four pixel
+ and r6, r10, #0x00FF
+ and r8, r10, #0xFF00
+
+ mov r10, r10, LSR #16
+
+ and r7, r10, #0x00FF
+ and r9, r10, #0xFF00
+
+ @ Combine them
+ orr r6, r8, LSL #8
+ orr r7, r9, LSL #8
+
+ @ isolate next four pixels
+ and r8, r11, #0x00FF
+ and r10, r11, #0xFF00
+
+ mov r11, r11, LSR #16
+
+ and r9, r11, #0x00FF
+ and r11, r11, #0xFF00
+
+ @ Combine them
+ orr r8, r10, LSL #8
+ orr r9, r11, LSL #8
+
+ @ Store result of conversion to dst
+ stmia r1!, {r6-r9}
+
+ subs lr, lr, #1
+
+ @ Proceed to next line
+video_blockline_patch_block_1:
+ addne r3, #(0)
+ bne copy_block_1
+
+
+ @@ Copy second block
+video_blockline_patch_block_2_start:
+ sub r3, #(0)
+ mov lr, #8
+
+copy_block_2: @ data conversion from 8 bits to 16 bits
+
+ ldm r3, {r10-r11} @ Get 8 pixels from y_src
+
+ @ isolate the first four pixel
+ and r6, r10, #0x00FF
+ and r8, r10, #0xFF00
+
+ mov r10, r10, LSR #16
+
+ and r7, r10, #0x00FF
+ and r9, r10, #0xFF00
+
+ @ Combine them
+ orr r6, r8, LSL #8
+ orr r7, r9, LSL #8
+
+ @ isolate next four pixels
+ and r8, r11, #0x00FF
+ and r10, r11, #0xFF00
+
+ mov r11, r11, LSR #16
+
+ and r9, r11, #0x00FF
+ and r11, r11, #0xFF00
+
+ @ Combine them
+ orr r8, r10, LSL #8
+ orr r9, r11, LSL #8
+
+ @ Store result of conversion to dst
+ stmia r1!, {r6-r9}
+
+ subs lr, lr, #1
+
+ @ Proceed to next line
+video_blockline_patch_block_2:
+ addne r3, #(0)
+ bne copy_block_2
+
+
+ @@ Copy third block
+video_blockline_patch_block_3_start:
+ add r3, #(0)
+ mov lr, #8
+
+copy_block_3: @ data conversion from 8 bits to 16 bits
+
+ ldmia r3!, {r10-r11} @ Get 8 pixels from y_src
+
+ @ isolate the first four pixel
+ and r6, r10, #0x00FF
+ and r8, r10, #0xFF00
+
+ mov r10, r10, LSR #16
+
+ and r7, r10, #0x00FF
+ and r9, r10, #0xFF00
+
+ @ Combine them
+ orr r6, r8, LSL #8
+ orr r7, r9, LSL #8
+
+ @ isolate next four pixels
+ and r8, r11, #0x00FF
+ and r10, r11, #0xFF00
+
+ mov r11, r11, LSR #16
+
+ and r9, r11, #0x00FF
+ and r11, r11, #0xFF00
+
+ @ Combine them
+ orr r8, r10, LSL #8
+ orr r9, r11, LSL #8
+
+ @ Store result of conversion to dst
+ stmia r1!, {r6-r9}
+
+ subs lr, lr, #1
+
+ @ Proceed to next line
+video_blockline_patch_block_3:
+ addne r3, #(0)
+ bne copy_block_3
+
+
+ @@ Copy fourth block
+video_blockline_patch_block_4_start:
+ sub r3, #(0)
+ mov lr, #8
+
+copy_block_4: @ data conversion from 8 bits to 16 bits
+
+ ldmia r3!, {r10-r11} @ Get 8 pixels from y_src
+
+ @ isolate the first four pixel
+ and r6, r10, #0x00FF
+ and r8, r10, #0xFF00
+
+ mov r10, r10, LSR #16
+
+ and r7, r10, #0x00FF
+ and r9, r10, #0xFF00
+
+ @ Combine them
+ orr r6, r8, LSL #8
+ orr r7, r9, LSL #8
+
+ @ isolate next four pixels
+ and r8, r11, #0x00FF
+ and r10, r11, #0xFF00
+
+ mov r11, r11, LSR #16
+
+ and r9, r11, #0x00FF
+ and r11, r11, #0xFF00
+
+ @ Combine them
+ orr r8, r10, LSL #8
+ orr r9, r11, LSL #8
+
+ @ Store result of conversion to dst
+ stmia r1!, {r6-r9}
+
+ subs lr, lr, #1
+
+ @ Proceed to next line
+video_blockline_patch_block_4:
+ addne r3, #(0)
+ bne copy_block_4
+
+video_blockline_patch_fix_y:
+ sub r3, #(0) @ Fix r3 for next iteration
+
+
+ @@ Chrominances
+ @@ Copy fifth block
+ mov lr, #8
+
+copy_block_cb: @ data conversion from 8 bits to 16 bits
+
+ ldmia r4!, {r10-r11} @ Get 8 pixels from cb_src
+
+ @ isolate the first four pixel
+ and r6, r10, #0x00FF
+ and r8, r10, #0xFF00
+
+ mov r10, r10, LSR #16
+
+ and r7, r10, #0x00FF
+ and r9, r10, #0xFF00
+
+ @ Combine them
+ orr r6, r8, LSL #8
+ orr r7, r9, LSL #8
+
+ @ isolate next four pixels
+ and r8, r11, #0x00FF
+ and r10, r11, #0xFF00
+
+ mov r11, r11, LSR #16
+
+ and r9, r11, #0x00FF
+ and r11, r11, #0xFF00
+
+ @ Combine them
+ orr r8, r10, LSL #8
+ orr r9, r11, LSL #8
+
+ @ Store result of conversion to dst
+ stmia r1!, {r6-r9}
+
+ subs lr, lr, #1
+
+ @ Proceed to next line
+video_blockline_patch_block_cb:
+ addne r4, #(0)
+ bne copy_block_cb
+
+video_blockline_patch_fix_cb:
+ sub r4, #(0) @ Fix r4 for next iteration
+
+
+ @@ Copy sixth block
+ mov lr, #8
+
+copy_block_cr: @ data conversion from 8 bits to 16 bits
+
+ ldmia r5!, {r10-r11} @ Get 8 pixels from cr_src
+
+ @ isolate the first four pixel
+ and r6, r10, #0x00FF
+ and r8, r10, #0xFF00
+
+ mov r10, r10, LSR #16
+
+ and r7, r10, #0x00FF
+ and r9, r10, #0xFF00
+
+ @ Combine them
+ orr r6, r8, LSL #8
+ orr r7, r9, LSL #8
+
+ @ isolate next four pixels
+ and r8, r11, #0x00FF
+ and r10, r11, #0xFF00
+
+ mov r11, r11, LSR #16
+
+ and r9, r11, #0x00FF
+ and r11, r11, #0xFF00
+
+ @ Combine them
+ orr r8, r10, LSL #8
+ orr r9, r11, LSL #8
+
+ @ Store result of conversion to dst
+ stmia r1!, {r6-r9}
+
+ subs lr, lr, #1
+
+ @ Proceed to next line
+video_blockline_patch_block_cr:
+ addne r5, #(0)
+ bne copy_block_cr
+
+video_blockline_patch_fix_cr:
+ sub r5, #(0) @ Fix r5 for next iteration
+
+
+ subs r2, r2, #1
+ bne video_blockline_to_macro_blocks_loop
+
+ stm r0, { r3, r4, r5 }
+ ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+#endif // HAS_VIDEO_BLOCKLINE_TO_MACRO_BLOCKS