Index: firmware/target/arm/ipod/lcd-as-color-nano.S
===================================================================
--- firmware/target/arm/ipod/lcd-as-color-nano.S  (revision 28934)
+++ firmware/target/arm/ipod/lcd-as-color-nano.S  (working copy)
@@ -24,129 +24,255 @@
 #include "config.h"
 #include "cpu.h"
 
-    .section    .icode, "ax", %progbits
+/****************************************************************************
+ * #define FORCE_FIFO_WAIT
+ *
+ * This is not needed in YUV blitting when the LCD IF is fast enough. In this
+ * case YUV-to-RGB conversion per pixel needs longer than the transfer of a
+ * pixel via the LCD IF. For iPod nano 2G this is true if the LCD IF is
+ * configured to use LCD_PHTIME = 0x00 (see lcd-nano2g.c).
+ ****************************************************************************/
 
+#define FORCE_FIFO_WAIT
+
+    .section .icode, "ax", %progbits
+
 /****************************************************************************
-*  void lcd_yuv_write_inner_loop(unsigned char const * const ysrc,
-*                                unsigned char const * const usrc,
-*                                unsigned char const * const vsrc,
-*                                int width);
-*
-*   YUV- > RGB565 conversion
-*   |R|   |1.000000 -0.000001  1.402000| |Y'|
-*   |G| = |1.000000 -0.334136 -0.714136| |Pb|
-*   |B|   |1.000000  1.772000  0.000000| |Pr|
-*   Scaled, normalized, rounded and tweaked to yield RGB 565:
-*   |R|   |74   0 101| |Y' -  16| >> 9
-*   |G| = |74 -24 -51| |Cb - 128| >> 8
-*   |B|   |74 128   0| |Cr - 128| >> 9
-*
-*/
+ * extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
+ *                                    const unsigned LCD_BASE,
+ *                                    int width,
+ *                                    int stride);
+ *
+ *   Conversion from Motion JPEG and MPEG Y'PbPr to RGB is:
+ *   |R|   |1.164  0.000  1.596| |Y' -  16|
+ *   |G| = |1.164 -0.391 -0.813| |Pb - 128|
+ *   |B|   |1.164  2.018  0.000| |Pr - 128|
+ *
+ *   Scaled, normalized, rounded and tweaked to yield RGB 565:
+ *   |R|   |74   0 101| |Y' -  16| >> 9
+ *   |G| = |74 -24 -51| |Cb - 128| >> 8
+ *   |B|   |74 128   0| |Cr - 128| >> 9
+ *
+ * Converts two lines from YUV to RGB565 and writes to LCD at once. First loop
+ * loads Cb/Cr, calculates the chroma offsets and saves them to buffer. Within
+ * the second loop these chroma offsets are reloaded from buffer. Each loop
+ * iteration converts two pixels and writes them to the LCD as one 32 bit word.
+ */
     .align      2
-    .global     lcd_yuv_write_inner_loop
-    .type       lcd_yuv_write_inner_loop, %function
+    .global     lcd_write_yuv420_lines
+    .type       lcd_write_yuv420_lines, %function
+lcd_write_yuv420_lines:
+                                      /* r0 = src = yuv_src */
+                                      /* r1 = dst = LCD_BASE */
+                                      /* r2 = width */
+                                      /* r3 = stride */
+    stmfd       sp!, { r4-r10, lr }   /* save non-scratch */
+    ldmia       r0, { r9, r10, r12 }  /* r9 = yuv_src[0] = Y'_p */
+                                      /* r10 = yuv_src[1] = Cb_p */
+                                      /* r12 = yuv_src[2] = Cr_p */
+    add         r3, r9, r3            /* r3 = &ysrc[stride] */
+    add         r4, r2, r2, asr #1    /* chroma buffer length = width/2 *3 */
+    mov         r4, r4, asl #2        /*   use words for str/ldm possibility */
+    add         r4, r4, #19           /*   plus room for 4 additional words, */
+    bic         r4, r4, #3            /*   rounded up to multiples of 4 byte */
+    sub         sp, sp, r4            /*   and allocate on stack */
+    stmia       sp, {r1-r4}           /* LCD_BASE, width, &ysrc[stride], stack_alloc */
 
-lcd_yuv_write_inner_loop:
-                                        @ r0 = ysrc
-                                        @ r1 = usrc
-                                        @ r2 = vsrc
-                                        @ r3 = width
-    stmfd sp!, { r4-r11, lr }           @ save regs
-    mov r4, #0x70000000                 @ r4 = LCD2_BLOCK_CTRL - 0x20
-    add r4, r4, #0x8a00                 @
-    add r5, r4, #0x100                  @ r5 = LCD2_BLOCK_DATA
-10:                                     @ loop
+    mov         r7, r2                /* r7 = loop count */
+    add         r8, sp, #16           /* chroma buffer */
+    add         lr, r1, #0x100        /* LCD data port = LCD_BASE + 0x100 */
 
-    ldrb r7, [r1], #1                   @ *usrc++
-    ldrb r8, [r2], #1                   @ *vsrc++
+    /* 1st loop start */
+10:                                   /* loop start */
 
-    sub r7, r7, #128                    @ Cb -= 128
-    sub r8, r8, #128                    @ Cr -= 128
+    ldrb        r0, [r10], #1         /* r0 = *usrc++ = *Cb_p++ */
+    ldrb        r1, [r12], #1         /* r1 = *vsrc++ = *Cr_p++ */
 
-    add r10, r8, r8, asl #2             @ Cr*101
-    add r10, r10, r8, asl #5
-    add r10, r10, r8, asl #6
+    sub         r0, r0, #128          /* r0 = Cb-128 */
+    sub         r1, r1, #128          /* r1 = Cr-128 */
 
-    add r11, r8, r8, asl #1             @ Cr*51 + Cb*24
-    add r11, r11, r11, asl #4
-    add r11, r11, r7, asl #3
-    add r11, r11, r7, asl #4
+    add         r2, r1, r1, asl #1    /* r2 = Cr*51 + Cb*24 */
+    add         r2, r2, r2, asl #4
+    add         r2, r2, r0, asl #3
+    add         r2, r2, r0, asl #4
 
-    add r12, r7, #2                     @ r12 = bu = (Cb*128 + 256) >> 9
-    mov r12, r12, asr #2
-    add r10, r10, #256                  @ r10 = rv = (Cr*101 + 256) >> 9
-    mov r10, r10, asr #9
-    rsb r11, r11, #128                  @ r11 = guv = (-r11 + 128) >> 8
-    mov r11, r11, asr #8
+    add         r4, r1, r1, asl #2    /* r1 = Cr*101 (r4 is scratch) */
+    add         r4, r4, r1, asl #5
+    add         r1, r4, r1, asl #6
 
-@ pixel_1
-    ldrb r7, [r0], #1                   @ *ysrc++
-    sub r7, r7, #16                     @ Y = (Y' - 16) * 37
-    add r8, r7, r7, asl #2
-    add r7, r8, r7, asl #5
+    add         r1, r1, #256          /* r1 = rv = (r1 + 256) >> 9 */
+    mov         r1, r1, asr #9
+    rsb         r2, r2, #128          /* r2 = guv = (-r2 + 128) >> 8 */
+    mov         r2, r2, asr #8
+    add         r0, r0, #2            /* r0 = bu = (Cb*128 + 256) >> 9 */
+    mov         r0, r0, asr #2
+    stmia       r8!, {r0-r2}          /* store r0, r1 and r2 to chroma buffer */
 
-    add r9, r10, r7, asr #8             @ R = (Y >> 8) + rv
-    add r8, r11, r7, asr #7             @ G = (Y >> 7) + guv
-    add r7, r12, r7, asr #8             @ B = (Y >> 8) + bu
+    /* 1st loop, first pixel */
+    ldrb        r5, [r9], #1          /* r5 = *ysrc++ = *Y'_p++ */
+    sub         r5, r5, #16           /* r5 = Y = (Y'-16) * 37 */
+    add         r3, r5, r5, asl #2
+    add         r5, r3, r5, asl #5
 
-    cmp r9, #31                         @ clamp R
-    mvnhi r9, r9, asr #31
-    andhi r9, r9, #31
+    add         r6, r1, r5, asr #8    /* r6 = r = (Y >> 8) + rv */
+    add         r3, r2, r5, asr #7    /* r3 = g = (Y >> 7) + guv */
+    add         r4, r0, r5, asr #8    /* r4 = b = (Y >> 8) + bu */
 
-    cmp r8, #63                         @ clamp G
-    mvnhi r8, r8, asr #31
-    andhi r8, r8, #63
+    orr         r5, r6, r4            /* check if clamping is needed... */
+    orr         r5, r5, r3, asr #1    /* ...at all */
+    cmp         r5, #31
+    bls         15f                   /* -> no clamp */
+    cmp         r6, #31               /* clamp r */
+    mvnhi       r6, r6, asr #31
+    andhi       r6, r6, #31
+    cmp         r3, #63               /* clamp g */
+    mvnhi       r3, r3, asr #31
+    andhi       r3, r3, #63
+    cmp         r4, #31               /* clamp b */
+    mvnhi       r4, r4, asr #31
+    andhi       r4, r4, #31
+15:                                   /* no clamp */
 
-    cmp r7, #31                         @ clamp B
-    mvnhi r7, r7, asr #31
-    andhi r7, r7, #31
+    /* calculate pixel_1 and save to r4 for later pixel packing */
+    orr         r4, r4, r3, lsl #5    /* pixel_1 = r<<11 | g<<5 | b */
+    orr         r4, r4, r6, lsl #11   /* r4 = pixel_1 */
 
-    orr r6, r7, r8, lsl #5              @ pack pixel
-    orr r6, r6, r9, lsl #11
+    /* 1st loop, second pixel */
+    ldrb        r5, [r9], #1          /* r5 = *ysrc++ = *Y'_p++ */
+    sub         r5, r5, #16           /* r5 = Y = (Y'-16) * 37 */
+    add         r3, r5, r5, asl #2
+    add         r5, r3, r5, asl #5
 
-    mov r7, r6, lsl #8                  @ swap bytes
-    and r7, r7, #0xff00
-    add r6, r7, r6, lsr #8
+    add         r6, r1, r5, asr #8    /* r6 = r = (Y >> 8) + rv */
+    add         r3, r2, r5, asr #7    /* r3 = g = (Y >> 7) + guv */
+    add         r5, r0, r5, asr #8    /* r5 = b = (Y >> 8) + bu */
 
-@ pixel_2
-    ldrb r7, [r0], #1                   @ *ysrc++
-    sub r7, r7, #16                     @ Y = (Y' - 16) * 37
-    add r8, r7, r7, asl #2
-    add r7, r8, r7, asl #5
+    orr         r0, r6, r5            /* check if clamping is needed... */
+    orr         r0, r0, r3, asr #1    /* ...at all (r0 = bu is dead now) */
+    cmp         r0, #31
+    bls         15f                   /* -> no clamp */
+    cmp         r6, #31               /* clamp r */
+    mvnhi       r6, r6, asr #31
+    andhi       r6, r6, #31
+    cmp         r3, #63               /* clamp g */
+    mvnhi       r3, r3, asr #31
+    andhi       r3, r3, #63
+    cmp         r5, #31               /* clamp b */
+    mvnhi       r5, r5, asr #31
+    andhi       r5, r5, #31
+15:                                   /* no clamp */
 
-    add r9, r10, r7, asr #8             @ R = (Y >> 8) + rv
-    add r8, r11, r7, asr #7             @ G = (Y >> 7) + guv
-    add r7, r12, r7, asr #8             @ B = (Y >> 8) + bu
+    /* calculate pixel_2 and pack with pixel_1 before writing */
+    orr         r5, r5, r3, lsl #5    /* pixel_2 = r<<11 | g<<5 | b */
+    orr         r5, r5, r6, lsl #11   /* r5 = pixel_2 */
+    mov         r3, r4, lsl #8        /* swap bytes of pixel_1 (r3 scratch) */
+    and         r3, r3, #0xff00
+    add         r4, r3, r4, lsr #8    /* r4[15:0] = swapped pixel_1 */
+    orr         r4, r4, r5, lsl #24   /* swap bytes of pixel_2 and merge... */
+    mov         r5, r5, lsr #8
+    orr         r4, r4, r5, lsl #16   /* ...into r4[31:16] */
+#ifdef FORCE_FIFO_WAIT
+    /* wait until the LCD bridge accepts data (LCD2_BLOCK_TXOK set) */
+.fifo_wait1:
+    ldr         r3, [lr, #-0xE0]      /* while !(LCD2_BLOCK_CTRL & 0x1000000); */
+    tst         r3, #0x1000000
+    beq         .fifo_wait1
+#endif
+    str         r4, [lr]              /* send both pixels as one word */
 
-    cmp r9, #31                         @ clamp R
-    mvnhi r9, r9, asr #31
-    andhi r9, r9, #31
+    subs        r7, r7, #2            /* check for loop end */
+    bgt         10b                   /* back to beginning  */
+    /* 1st loop end */
 
-    cmp r8, #63                         @ clamp G
-    mvnhi r8, r8, asr #31
-    andhi r8, r8, #63
+    /* Reload several registers for pointer rewinding for next loop */
+    add         r8, sp, #16           /* chroma buffer */
+    ldmia       sp, { r1, r7, r9}     /* r1  = LCD_BASE */
+                                      /* r7  = loop count */
+                                      /* r9 = &ysrc[stride] */
 
-    cmp r7, #31                         @ clamp B
-    mvnhi r7, r7, asr #31
-    andhi r7, r7, #31
+    /* 2nd loop start */
+20:                                   /* loop start */
+    /* restore r0 (bu), r1 (rv) and r2 (guv) from chroma buffer */
+    ldmia       r8!, {r0-r2}
 
-    orr r7, r7, r8, lsl #5              @ pack pixel
-    orr r7, r7, r9, lsl #11
+    /* 2nd loop, first pixel */
+    ldrb        r5, [r9], #1          /* r5 = *ysrc++ = *Y'_p++ */
+    sub         r5, r5, #16           /* r5 = Y = (Y'-16) * 37 */
+    add         r3, r5, r5, asl #2
+    add         r5, r3, r5, asl #5
 
-    orr r6, r6, r7, lsl #24             @ swap bytes and add pixels simultaneously
-    mov r7, r7, lsr #8
-    orr r6, r6, r7, lsl #16
-#if 1
-11:                                     @ while (!(LCD2_BLOCK_CTRL & LCD2_BLOCK_TXOK));
-    ldr r11, [r4, #0x20]                @
-    tst r11, #0x1000000                 @
-    beq 11b                             @
+    add         r6, r1, r5, asr #8    /* r6 = r = (Y >> 8) + rv */
+    add         r3, r2, r5, asr #7    /* r3 = g = (Y >> 7) + guv */
+    add         r4, r0, r5, asr #8    /* r4 = b = (Y >> 8) + bu */
+
+    orr         r5, r6, r4            /* check if clamping is needed... */
+    orr         r5, r5, r3, asr #1    /* ...at all */
+    cmp         r5, #31
+    bls         15f                   /* -> no clamp */
+    cmp         r6, #31               /* clamp r */
+    mvnhi       r6, r6, asr #31
+    andhi       r6, r6, #31
+    cmp         r3, #63               /* clamp g */
+    mvnhi       r3, r3, asr #31
+    andhi       r3, r3, #63
+    cmp         r4, #31               /* clamp b */
+    mvnhi       r4, r4, asr #31
+    andhi       r4, r4, #31
+15:                                   /* no clamp */
+    /* calculate pixel_1 and save to r4 for later pixel packing */
+    orr         r4, r4, r3, lsl #5    /* pixel_1 = r<<11 | g<<5 | b */
+    orr         r4, r4, r6, lsl #11   /* r4 = pixel_1 */
+
+    /* 2nd loop, second pixel */
+    ldrb        r5, [r9], #1          /* r5 = *ysrc++ = *Y'_p++ */
+    sub         r5, r5, #16           /* r5 = Y = (Y'-16) * 37 */
+    add         r3, r5, r5, asl #2
+    add         r5, r3, r5, asl #5
+
+    add         r6, r1, r5, asr #8    /* r6 = r = (Y >> 8) + rv */
+    add         r3, r2, r5, asr #7    /* r3 = g = (Y >> 7) + guv */
+    add         r5, r0, r5, asr #8    /* r5 = b = (Y >> 8) + bu */
+
+    orr         r0, r6, r5            /* check if clamping is needed... */
+    orr         r0, r0, r3, asr #1    /* ...at all (r0 = bu is dead now) */
+    cmp         r0, #31
+    bls         15f                   /* -> no clamp */
+    cmp         r6, #31               /* clamp r */
+    mvnhi       r6, r6, asr #31
+    andhi       r6, r6, #31
+    cmp         r3, #63               /* clamp g */
+    mvnhi       r3, r3, asr #31
+    andhi       r3, r3, #63
+    cmp         r5, #31               /* clamp b */
+    mvnhi       r5, r5, asr #31
+    andhi       r5, r5, #31
+15:                                   /* no clamp */
+
+    /* calculate pixel_2 and pack with pixel_1 before writing */
+    orr         r5, r5, r3, lsl #5    /* pixel_2 = r<<11 | g<<5 | b */
+    orr         r5, r5, r6, lsl #11   /* r5 = pixel_2 */
+    mov         r3, r4, lsl #8        /* swap bytes of pixel_1 (r3 scratch) */
+    and         r3, r3, #0xff00
+    add         r4, r3, r4, lsr #8    /* r4[15:0] = swapped pixel_1 */
+    orr         r4, r4, r5, lsl #24   /* swap bytes of pixel_2 and merge... */
+    mov         r5, r5, lsr #8
+    orr         r4, r4, r5, lsl #16   /* ...into r4[31:16] */
+#ifdef FORCE_FIFO_WAIT
+    /* wait until the LCD bridge accepts data (LCD2_BLOCK_TXOK set) */
+.fifo_wait2:
+    ldr         r3, [lr, #-0xE0]      /* while !(LCD2_BLOCK_CTRL & 0x1000000); */
+    tst         r3, #0x1000000
+    beq         .fifo_wait2
 #endif
-    str r6, [r5]                        @ send two pixels
+    str         r4, [lr]              /* send both pixels as one word */
 
-    subs r3, r3, #2                     @ decrease width
-    bgt 10b                             @ loop
+    subs        r7, r7, #2            /* check for loop end */
+    bgt         20b                   /* back to beginning  */
+    /* 2nd loop end */
 
-    ldmpc regs=r4-r11                   @ restore regs
-    .ltorg                              @ dump constant pool
-    .size   lcd_yuv_write_inner_loop, .-lcd_yuv_write_inner_loop
+    ldr         r3, [sp, #12]         /* r3 = stack_alloc saved at entry */
+    add         sp, sp, r3            /* deallocate buffer */
+    ldmpc       regs=r4-r10           /* restore registers */
+
+    .ltorg
+    .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
Index: firmware/target/arm/ipod/lcd-color_nano.c
===================================================================
--- firmware/target/arm/ipod/lcd-color_nano.c  (revision 28932)
+++ firmware/target/arm/ipod/lcd-color_nano.c  (working copy)
@@ -120,24 +120,23 @@
 #endif
 }
 
-/*** update functions ***/
-extern void lcd_yuv_write_inner_loop(unsigned char const * const ysrc,
-                                     unsigned char const * const usrc,
-                                     unsigned char const * const vsrc,
-                                     int width);
+/* Line write helper function for lcd_yuv_blit. Writes two lines of yuv420. */
+extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
+                                   const unsigned int lcd_baseadress,
+                                   int width,
+                                   int stride);
 
-#define CSUB_X 2
-#define CSUB_Y 2
-
 /* Performance function to blit a YUV bitmap directly to the LCD */
 void lcd_blit_yuv(unsigned char * const src[3],
                   int src_x, int src_y, int stride,
                   int x, int y, int width, int height)
 {
     int h;
-    int y0, x0, y1, x1;
+    int z, y0, x0, y1, x1;
+    unsigned char const * yuv_src[3];
 
-    width = (width + 1) & ~1;
+    width  = (width  + 1) & ~1; /* pixels are written in pairs */
+    height = (height + 1) & ~1; /* lines are written in pairs */
 
     /* calculate the drawing region */
 #if CONFIG_LCD == LCD_IPODNANO
@@ -195,24 +194,18 @@
         LCD2_PORT = (LCD2_CMD_MASK|LCD_CNTL_WRITE_TO_GRAM);
     }
 
-    const int stride_div_csub_x = stride/CSUB_X;
+    z = stride * src_y;
+    yuv_src[0] = src[0] + z + src_x;               /* first Y' sample */
+    yuv_src[1] = src[1] + (z >> 2) + (src_x >> 1); /* first Cb sample */
+    yuv_src[2] = src[2] + (yuv_src[1] - src[1]);   /* Cr: same offset as Cb */
 
-    h=0;
-    while (1)
-    {
-        /* upsampling, YUV->RGB conversion and reduction to RGB565 in one go */
-        const unsigned char *ysrc = src[0] + stride * src_y + src_x;
+    /* Each lcd_write_yuv420_lines() call below converts two lines. */
 
-        const int uvoffset = stride_div_csub_x * (src_y/CSUB_Y) +
-                             (src_x/CSUB_X);
-
-        const unsigned char *usrc = src[1] + uvoffset;
-        const unsigned char *vsrc = src[2] + uvoffset;
-
+    h=0;
+    while (1) {
         int pixels_to_write;
 
-        if (h==0)
-        {
+        if (h==0) {
             while (!(LCD2_BLOCK_CTRL & LCD2_BLOCK_READY));
             LCD2_BLOCK_CONFIG = 0;
 
@@ -221,9 +214,7 @@
             pixels_to_write = (width * height) * 2;
             h = height;
 
-            /* calculate how much we can do in one go */
-            if (pixels_to_write > 0x10000)
-            {
+            if (pixels_to_write > 0x10000) { /* limit to one block transfer */
                 h = ((0x10000/2) / width) & ~1; /* even: 2 lines per call */
                 pixels_to_write = (width * h) * 2;
             }
@@ -234,10 +225,12 @@
             LCD2_BLOCK_CTRL = 0x34000000;
         }
 
-        lcd_yuv_write_inner_loop(ysrc,usrc,vsrc,width);
+        lcd_write_yuv420_lines(yuv_src, LCD2_BASE, width, stride);
+        yuv_src[0] += stride << 1; /* Skip down two luma lines */
+        yuv_src[1] += stride >> 1; /* Skip down one chroma line */
+        yuv_src[2] += stride >> 1;
 
-        src_y++;
-        h--;
+        h-=2; /* two lines were written by the call above */
     }
 
     while (!(LCD2_BLOCK_CTRL & LCD2_BLOCK_READY));