217 lines
5.1 KiB
ArmAsm
217 lines
5.1 KiB
ArmAsm
|
/* libs/pixelflinger/t32cb16blend.S
**
** Copyright 2006, The Android Open Source Project
** Copyright (c) 2009, The Linux Foundation. All rights reserved.
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/
|
||
|
|
||
|
|
||
|
// Code goes into the text section, aligned to 2^2 = 4 bytes (the size of
// an ARM instruction).
    .text
    .align  2

// Entry point exported to the rest of the library.
    .globl  scanline_t32cb16blend_arm
|
||
|
|
||
|
|
||
|
/*
 * pixel DREG, SRC, FB, ODD
 *
 * Blends one 32-bit source pixel over one RGB565 destination pixel.
 * For each of the three channels the macro computes
 *
 *      f   = 0x100 - (sA + (sA >> 7))          (sA = SRC >> 24)
 *      out = src_channel + ((dst_channel * f) >> 8)
 *
 * and clamps `out` to the channel maximum (0x1F for red and blue,
 * 0x3F for green).
 *
 * Arguments:
 *
 *   DREG  32-bit register holding *two* original destination RGB565
 *         pixels: the even one in bits 0-15, the odd one in bits 16-31.
 *
 *   SRC   32-bit 0xAABBGGRR pixel value with pre-multiplied colors.
 *
 *   FB    register that receives the blended pixel value(s).
 *
 *   ODD   zero:     blend the low 16-bit pixel of DREG. FB is overwritten
 *                   (the red step writes it with mov), leaving bits 16-31
 *                   of FB clear.
 *         non-zero: blend the high 16-bit pixel of DREG. The result is
 *                   only ORed into bits 16-31 of FB, so the even pixel
 *                   must have been expanded into the same FB beforehand.
 *
 * Clobbers: r6, r7, lr and the condition flags.
 *
 * DREG, SRC and FB must therefore not be r6, r7 or lr, and FB must be a
 * different register from DREG and SRC because FB is written while both
 * are still being read.
 */

.macro pixel,   DREG, SRC, FB, ODD

    // r7 = f = 0x100 - (sA + (sA >> 7)), i.e. inverse alpha in 0..0x100
    mov     r7, \SRC, lsr #24               // sA
    add     r7, r7, r7, lsr #7              // sA + (sA >> 7)
    rsb     r7, r7, #0x100                  // 0x100 - (sA + (sA >> 7))

.ifeq \ODD

    // ---- even pixel: DREG bits 0-15 -> FB bits 0-15 ----

    // red (dst bits 11-15, src bits 3-7); this step initialises FB
    mov     lr, \DREG, lsr #11
    and     lr, lr, #0x1F
    smulbb  lr, r7, lr                      // f * dR
    mov     r6, \SRC, lsr #3
    and     r6, r6, #0x1F                   // sR reduced to 5 bits
    add     lr, r6, lr, lsr #8
    cmp     lr, #0x1F
    movhs   \FB, #(0x1F << 11)              // saturate
    movlo   \FB, lr, lsl #11

    // green (dst bits 5-10, src bits 10-15)
    and     r6, \DREG, #(0x3F << 5)
    smulbb  r6, r7, r6                      // f * (dG << 5)
    mov     lr, \SRC, lsr #10
    and     lr, lr, #0x3F                   // sG reduced to 6 bits
    add     r6, lr, r6, lsr #13             // 13 = 5 (field offset) + 8
    cmp     r6, #0x3F
    orrhs   \FB, \FB, #(0x3F << 5)          // saturate
    orrlo   \FB, \FB, r6, lsl #5

    // blue (dst bits 0-4, src bits 19-23)
    and     lr, \DREG, #0x1F
    smulbb  lr, r7, lr                      // f * dB
    mov     r6, \SRC, lsr #19
    and     r6, r6, #0x1F                   // sB reduced to 5 bits
    add     lr, r6, lr, lsr #8
    cmp     lr, #0x1F
    orrhs   \FB, \FB, #0x1F                 // saturate
    orrlo   \FB, \FB, lr

.else

    // ---- odd pixel: DREG bits 16-31 -> ORed into FB bits 16-31 ----

    // red (dst bits 27-31: the shift alone isolates it, no mask needed)
    mov     lr, \DREG, lsr #27
    smulbb  lr, r7, lr                      // f * dR
    mov     r6, \SRC, lsr #3
    and     r6, r6, #0x1F                   // sR reduced to 5 bits
    add     lr, r6, lr, lsr #8
    cmp     lr, #0x1F
    orrhs   \FB, \FB, #(0x1F << 27)         // saturate
    orrlo   \FB, \FB, lr, lsl #27

    // green (dst bits 21-26); smulbt multiplies by the *top* halfword,
    // where the masked field sits as (dG << 5)
    and     r6, \DREG, #(0x3F << 21)
    smulbt  r6, r7, r6                      // f * (dG << 5)
    mov     lr, \SRC, lsr #10
    and     lr, lr, #0x3F                   // sG reduced to 6 bits
    add     r6, lr, r6, lsr #13             // 13 = 5 (field offset) + 8
    cmp     r6, #0x3F
    orrhs   \FB, \FB, #(0x3F << 21)         // saturate
    orrlo   \FB, \FB, r6, lsl #21

    // blue (dst bits 16-20, i.e. the low 5 bits of the top halfword)
    and     lr, \DREG, #(0x1F << 16)
    smulbt  lr, r7, lr                      // f * dB
    mov     r6, \SRC, lsr #19
    and     r6, r6, #0x1F                   // sB reduced to 5 bits
    add     lr, r6, lr, lsr #8
    cmp     lr, #0x1F
    orrhs   \FB, \FB, #(0x1F << 16)         // saturate
    orrlo   \FB, \FB, lr, lsl #16

.endif

    .endm
|
||
|
|
||
|
|
||
|
/*
 * scanline_t32cb16blend_arm
 *
 * Blends a run of 32-bit source pixels over 16-bit destination pixels in
 * place, using the `pixel` macro for the per-pixel arithmetic.
 *
 * In:
 *   r0   dst ptr: 16-bit pixels. Accessed with ldrh/strh for single
 *        pixels and with ldr/str for pairs once it is 32-bit aligned.
 *        NOTE(review): assumed to be at least 2-byte aligned - confirm
 *        against callers.
 *   r1   src ptr: 32-bit pixels, read with ldr/ldmia.
 *   r2   count: number of pixels to blend.
 *
 * Register use inside the function:
 *   r3       d   destination pixel(s) loaded from dst (also used as a
 *                throw-away for the "both sources zero" test)
 *   r4       s0  source pixel for the even destination pixel
 *   r5       s1  source pixel for the odd destination pixel
 *   r6, r7   scratch, clobbered by `pixel`
 *   r8-r11   unused
 *   r12      blended result written back to dst
 *   r14/lr   scratch, clobbered by `pixel` (saved on entry)
 *
 * r4-r7 and lr are saved on entry and restored on every return path.
 * The function makes no calls.
 */

    // Typed as a function so the linker can handle ARM/Thumb interworking
    // for callers and tools can attribute addresses to this symbol.
    .type   scanline_t32cb16blend_arm, %function
scanline_t32cb16blend_arm:
    stmfd   sp!, {r4-r7, lr}

    pld     [r0]
    pld     [r1]

    // Align DST to 32 bits: when it is not, blend one leading pixel
    // through the single-pixel path first.
    tst     r0, #0x3
    beq     aligned
    subs    r2, r2, #1
    ldmlofd sp!, {r4-r7, lr}            // count was 0: return
    bxlo    lr

    // Single-pixel path, shared by the leading (unaligned) pixel and the
    // trailing pixel. On entry r2 already excludes the pixel blended here.
last:
    ldr     r4, [r1], #4
    ldrh    r3, [r0]
    pixel   r3, r4, r12, 0
    strh    r12, [r0], #2

aligned:
    subs    r2, r2, #2
    blo     9f                          // fewer than 2 pixels left

    // The main loop is unrolled three times and blends up to 6 pixels
    // (three pairs) per pass; only the first pair issues prefetches.
8:  ldmia   r1!, {r4, r5}
    // stream the source
    pld     [r1, #32]
    add     r0, r0, #4
    // both source pixels are all zero: skip this pair
    orrs    r3, r4, r5
    beq     7f

    // load the destination
    ldr     r3, [r0, #-4]
    // stream the destination
    pld     [r0, #32]
    pixel   r3, r4, r12, 0
    pixel   r3, r5, r12, 1
    // effectively, we're getting write-combining by virtue of the
    // cpu's write-back cache.
    str     r12, [r0, #-4]

    // 2nd iteration of the loop, don't stream anything
    subs    r2, r2, #2
    blt     9f
    ldmia   r1!, {r4, r5}
    add     r0, r0, #4
    orrs    r3, r4, r5
    beq     7f
    ldr     r3, [r0, #-4]
    pixel   r3, r4, r12, 0
    pixel   r3, r5, r12, 1
    str     r12, [r0, #-4]

    // 3rd iteration of the loop, don't stream anything
    subs    r2, r2, #2
    blt     9f
    ldmia   r1!, {r4, r5}
    add     r0, r0, #4
    orrs    r3, r4, r5
    beq     7f
    ldr     r3, [r0, #-4]
    pixel   r3, r4, r12, 0
    pixel   r3, r5, r12, 1
    str     r12, [r0, #-4]

7:  subs    r2, r2, #2
    bhs     8b                          // at least one more pair

    // Fewer than 2 pixels are left. Adding 1 carries out only when
    // exactly one pixel remains (r2 == -1), leaving r2 == 0 for `last`.
9:  adds    r2, r2, #1
    bhs     last
    ldmfd   sp!, {r4-r7, lr}            // return
    bx      lr

    .size   scanline_t32cb16blend_arm, . - scanline_t32cb16blend_arm
|