M7350v1_en_gpl

2024-09-09 08:52:07 +00:00
commit f9cc65cfda
65988 changed files with 26357421 additions and 0 deletions
@@ -0,0 +1,438 @@
+/* libs/pixelflinger/codeflinger/ARMAssembler.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#define LOG_TAG "ARMAssembler"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <cutils/log.h>
+#include <cutils/properties.h>
+
+#if defined(WITH_LIB_HARDWARE)
+#include <hardware_legacy/qemu_tracing.h>
+#endif
+
+#include <private/pixelflinger/ggl_context.h>
+
+#include "codeflinger/ARMAssembler.h"
+#include "codeflinger/CodeCache.h"
+#include "codeflinger/disassem.h"
+
+// ----------------------------------------------------------------------------
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark ARMAssembler...
+#endif
+
+/*
+ * Binds the assembler to `assembly`: both the base pointer and the write
+ * pointer start at the beginning of the assembly's buffer. The construction
+ * time is recorded so that generate() can report the elapsed time.
+ */
+ARMAssembler::ARMAssembler(const sp<Assembly>& assembly)
+    :   ARMAssemblerInterface(),
+        mAssembly(assembly)
+{
+    mBase = mPC = (uint32_t *)assembly->base();
+    // Only prolog() assigns a meaningful value; start from a known one so an
+    // epilog() without a matching prolog() never patches an indeterminate
+    // address.
+    mPrologPC = 0;
+    mDuration = ggl_system_time();
+#if defined(WITH_LIB_HARDWARE)
+    mQemuTracing = true;
+#endif
+}
+
+// Empty: no explicit cleanup is performed here.
+ARMAssembler::~ARMAssembler() { }
+
+// Address at which the next instruction word will be written.
+uint32_t* ARMAssembler::pc() const { return mPC; }
+
+// Address of the first instruction word of the code being assembled.
+uint32_t* ARMAssembler::base() const { return mBase; }
+
+// Forgets every recorded branch, label and comment, and rewinds both the
+// base and the write pointer to the start of the assembly's buffer.
+void ARMAssembler::reset()
+{
+    mComments.clear();
+    mLabelsInverseMapping.clear();
+    mLabels.clear();
+    mBranchTargets.clear();
+
+    uint32_t* const start = (uint32_t *)mAssembly->base();
+    mPC = start;
+    mBase = start;
+}
+
+// ----------------------------------------------------------------------------
+
+// Prints a listing of the code emitted so far to stdout. Each instruction is
+// preceded by its label and comment (when one was recorded for its address),
+// followed by the address, the raw word and the text from ::disassemble().
+// `name`, when non-null, is printed first as a heading.
+void ARMAssembler::disassemble(const char* name)
+{
+    if (name) {
+        printf("%s:\n", name);
+    }
+
+    uint32_t* insn = base();
+    for (size_t remaining = pc()-base(); remaining != 0; --remaining, ++insn) {
+        const ssize_t labelIndex = mLabelsInverseMapping.indexOfKey(insn);
+        if (labelIndex >= 0) {
+            printf("%s:\n", mLabelsInverseMapping.valueAt(labelIndex));
+        }
+
+        const ssize_t commentIndex = mComments.indexOfKey(insn);
+        if (commentIndex >= 0) {
+            printf("; %s\n", mComments.valueAt(commentIndex));
+        }
+
+        printf("%08x:    %08x    ", int(insn), int(insn[0]));
+        ::disassemble((u_int)insn);
+    }
+}
+
+// Attaches `string` to the address of the next instruction to be emitted;
+// disassemble() prints it in front of that instruction.
+void ARMAssembler::comment(const char* string)
+{
+    uint32_t* const here = mPC;
+    mComments.add(here, string);
+}
+
+// Defines `theLabel` at the current write position. The mapping is stored in
+// both directions: name -> address and address -> name.
+void ARMAssembler::label(const char* theLabel)
+{
+    uint32_t* const here = mPC;
+    mLabels.add(theLabel, here);
+    mLabelsInverseMapping.add(here, theLabel);
+}
+
+// Emits a B to a named label. The offset field is left at zero and the
+// instruction address is recorded so that generate() can fill it in.
+void ARMAssembler::B(int cc, const char* label)
+{
+    uint32_t* const site = mPC++;
+    mBranchTargets.add(branch_target_t(label, site));
+    *site = (cc<<28) | (0xA<<24) | 0;
+}
+
+// Emits a BL to a named label. The offset field is left at zero and the
+// instruction address is recorded so that generate() can fill it in.
+void ARMAssembler::BL(int cc, const char* label)
+{
+    uint32_t* const site = mPC++;
+    mBranchTargets.add(branch_target_t(label, site));
+    *site = (cc<<28) | (0xB<<24) | 0;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Prolog/Epilog & Generate...
+#endif
+
+
+// Emits a placeholder prolog: an STM of the full LSAVED register list.
+// Its address is remembered so epilog() can overwrite it once the set of
+// touched registers is known.
+void ARMAssembler::prolog()
+{
+    uint32_t* const here = mPC;
+    mPrologPC = here;
+    STM(AL, FD, SP, 1, LSAVED);
+}
+
+// Finalizes the function frame. `touched` is masked with LSAVED; the
+// placeholder written by prolog() is then replaced by either an STM of the
+// touched registers plus LR, or a NOP when nothing needs saving. Finally the
+// matching restore (if any) and the return are emitted at the current
+// position.
+void ARMAssembler::epilog(uint32_t touched)
+{
+    touched &= LSAVED;
+    const bool needsSave = (touched != 0);
+
+    // rewrite the prolog in place, then come back to where we were
+    uint32_t* const resume = mPC;
+    mPC = mPrologPC;
+    if (needsSave) {
+        STM(AL, FD, SP, 1, touched | LLR);
+    } else {
+        // heh, no registers to save!
+        MOV(AL, 0, R0, R0); // NOP
+    }
+    mPC = resume;
+
+    // write epilog code
+    if (needsSave) {
+        LDM(AL, FD, SP, 1, touched | LLR);
+    }
+    BX(AL, LR);
+}
+
+// Finishes code generation: patches every branch recorded by B()/BL() with a
+// label argument, resizes the assembly to the number of bytes emitted, logs
+// a summary, and dumps a disassembly when the "debug.pf.disasm" property is
+// set to a non-zero value. Always returns NO_ERROR.
+int ARMAssembler::generate(const char* name)
+{
+    // fixup all the branches (walked from the last one recorded to the first)
+    for (size_t idx = mBranchTargets.size(); idx-- > 0; ) {
+        const branch_target_t& bt = mBranchTargets[idx];
+        uint32_t* target_pc = mLabels.valueFor(bt.label);
+        LOG_ALWAYS_FATAL_IF(!target_pc,
+                "error resolving branch targets, target_pc is null");
+        // offset is counted in words, relative to the branch address + 2 words
+        const int32_t offset = int32_t(target_pc - (bt.pc+2));
+        *bt.pc |= offset & 0xFFFFFF;
+    }
+
+    const int instructionCount = int(pc()-base());
+    mAssembly->resize( instructionCount*4 );
+
+    // the instruction cache is flushed by CodeCache
+    const int64_t duration = ggl_system_time() - mDuration;
+    const char * const format = "generated %s (%d ins) at [%p:%p] in %lld ns\n";
+    LOGI(format, name, instructionCount, base(), pc(), duration);
+
+#if defined(WITH_LIB_HARDWARE)
+    if (__builtin_expect(mQemuTracing, 0)) {
+        // stop trying once adding a mapping has failed
+        const int err = qemu_add_mapping(int(base()), name);
+        mQemuTracing = (err >= 0);
+    }
+#endif
+
+    char value[PROPERTY_VALUE_MAX];
+    property_get("debug.pf.disasm", value, "0");
+    const bool dumpRequested = (atoi(value) != 0);
+    if (dumpRequested) {
+        printf(format, name, instructionCount, base(), pc(), duration);
+        disassemble(name);
+    }
+
+    return NO_ERROR;
+}
+
+// Looks up the address recorded for `label` by label().
+uint32_t* ARMAssembler::pcForLabel(const char* label)
+{
+    uint32_t* const where = mLabels.valueFor(label);
+    return where;
+}
+
+// ----------------------------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Data Processing...
+#endif
+
+// Emits one data-processing instruction word: condition in bits 31:28,
+// opcode from bit 21, `s` at bit 20, Rn from bit 16, Rd from bit 12, with
+// the pre-encoded second operand `Op2` OR-ed in as is.
+void ARMAssembler::dataProcessing(int opcode, int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    const uint32_t insn =
+            (cc<<28) | (opcode<<21) | (s<<20) | (Rn<<16) | (Rd<<12) | Op2;
+    *mPC++ = insn;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Multiply...
+#endif
+
+// multiply...
+// Emits MLA. When Rd equals Rm the two multiplicands are exchanged first;
+// if Rd still equals Rm afterwards LOG_FATAL_IF reports it.
+void ARMAssembler::MLA(int cc, int s,
+        int Rd, int Rm, int Rs, int Rn)
+{
+    if (Rd == Rm) {
+        const int previousRm = Rm;
+        Rm = Rs;
+        Rs = previousRm;
+    }
+    LOG_FATAL_IF(Rd==Rm, "MLA(r%u,r%u,r%u,r%u)", Rd,Rm,Rs,Rn);
+    const uint32_t insn =
+            (cc<<28) | (1<<21) | (s<<20) |
+            (Rd<<16) | (Rn<<12) | (Rs<<8) | 0x90 | Rm;
+    *mPC++ = insn;
+}
+// Emits MUL. When Rd equals Rm the two multiplicands are exchanged first;
+// if Rd still equals Rm afterwards LOG_FATAL_IF reports it.
+void ARMAssembler::MUL(int cc, int s,
+        int Rd, int Rm, int Rs)
+{
+    if (Rd == Rm) {
+        const int previousRm = Rm;
+        Rm = Rs;
+        Rs = previousRm;
+    }
+    LOG_FATAL_IF(Rd==Rm, "MUL(r%u,r%u,r%u)", Rd,Rm,Rs);
+    const uint32_t insn =
+            (cc<<28) | (s<<20) | (Rd<<16) | (Rs<<8) | 0x90 | Rm;
+    *mPC++ = insn;
+}
+// Emits UMULL. RdLo, RdHi and Rm must all be different registers
+// (checked with LOG_FATAL_IF).
+void ARMAssembler::UMULL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs)
+{
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "UMULL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    const uint32_t insn =
+            (cc<<28) | (1<<23) | (s<<20) |
+            (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    *mPC++ = insn;
+}
+// Emits the accumulating form of UMULL (bit 21 set in addition to bit 23).
+// RdLo, RdHi and Rm must all be different registers (checked with
+// LOG_FATAL_IF).
+void ARMAssembler::UMUAL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs)
+{
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "UMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    const uint32_t insn =
+            (cc<<28) | (1<<23) | (1<<21) | (s<<20) |
+            (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    *mPC++ = insn;
+}
+// Emits SMULL (bits 23 and 22 set). RdLo, RdHi and Rm must all be different
+// registers (checked with LOG_FATAL_IF).
+void ARMAssembler::SMULL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs)
+{
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMULL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    const uint32_t insn =
+            (cc<<28) | (1<<23) | (1<<22) | (s<<20) |
+            (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    *mPC++ = insn;
+}
+// Emits the accumulating form of SMULL (bit 21 set in addition to bits 23
+// and 22). RdLo, RdHi and Rm must all be different registers (checked with
+// LOG_FATAL_IF).
+void ARMAssembler::SMUAL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs)
+{
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    const uint32_t insn =
+            (cc<<28) | (1<<23) | (1<<22) | (1<<21) | (s<<20) |
+            (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    *mPC++ = insn;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Branches...
+#endif
+
+// branches...
+// Emits a B to the absolute address `pc`. The 24-bit offset is the distance
+// in words from the branch address plus two words.
+void ARMAssembler::B(int cc, uint32_t* pc)
+{
+    const int32_t offset = int32_t(pc - (mPC+2));
+    const uint32_t insn = (cc<<28) | (0xA<<24) | (offset & 0xFFFFFF);
+    *mPC++ = insn;
+}
+
+// Emits a BL to the absolute address `pc`. The 24-bit offset is the distance
+// in words from the branch address plus two words.
+void ARMAssembler::BL(int cc, uint32_t* pc)
+{
+    const int32_t offset = int32_t(pc - (mPC+2));
+    const uint32_t insn = (cc<<28) | (0xB<<24) | (offset & 0xFFFFFF);
+    *mPC++ = insn;
+}
+
+// Emits BX Rn (branch to the address held in register Rn).
+void ARMAssembler::BX(int cc, int Rn)
+{
+    const uint32_t insn = (cc<<28) | 0x12FFF10 | Rn;
+    *mPC++ = insn;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Data Transfer...
+#endif
+
+// data transfer...
+// Emits LDR. `offset` carries the pre-encoded addressing-mode bits (see the
+// immed12_* / reg_scale_* helpers) and is OR-ed in unchanged.
+void ARMAssembler::LDR(int cc, int Rd, int Rn, uint32_t offset)
+{
+    const uint32_t insn =
+            (cc<<28) | (1<<26) | (1<<20) | (Rn<<16) | (Rd<<12) | offset;
+    *mPC++ = insn;
+}
+// Emits LDRB: same layout as LDR with bit 22 set. `offset` carries the
+// pre-encoded addressing-mode bits and is OR-ed in unchanged.
+void ARMAssembler::LDRB(int cc, int Rd, int Rn, uint32_t offset)
+{
+    const uint32_t insn =
+            (cc<<28) | (1<<26) | (1<<22) | (1<<20) | (Rn<<16) | (Rd<<12) | offset;
+    *mPC++ = insn;
+}
+// Emits STR: same layout as LDR with bit 20 clear. `offset` carries the
+// pre-encoded addressing-mode bits and is OR-ed in unchanged.
+void ARMAssembler::STR(int cc, int Rd, int Rn, uint32_t offset)
+{
+    const uint32_t insn =
+            (cc<<28) | (1<<26) | (Rn<<16) | (Rd<<12) | offset;
+    *mPC++ = insn;
+}
+// Emits STRB: same layout as STR with bit 22 set. `offset` carries the
+// pre-encoded addressing-mode bits and is OR-ed in unchanged.
+void ARMAssembler::STRB(int cc, int Rd, int Rn, uint32_t offset)
+{
+    const uint32_t insn =
+            (cc<<28) | (1<<26) | (1<<22) | (Rn<<16) | (Rd<<12) | offset;
+    *mPC++ = insn;
+}
+
+// Emits LDRH. `offset` carries the pre-encoded addressing-mode bits (see the
+// immed8_* / reg_pre / reg_post helpers) and is OR-ed in unchanged.
+void ARMAssembler::LDRH(int cc, int Rd, int Rn, uint32_t offset)
+{
+    const uint32_t insn =
+            (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xB0 | offset;
+    *mPC++ = insn;
+}
+// Emits LDRSB: same layout as LDRH with 0xD0 in bits 7:4. `offset` carries
+// the pre-encoded addressing-mode bits and is OR-ed in unchanged.
+void ARMAssembler::LDRSB(int cc, int Rd, int Rn, uint32_t offset)
+{
+    const uint32_t insn =
+            (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xD0 | offset;
+    *mPC++ = insn;
+}
+// Emits LDRSH: same layout as LDRH with 0xF0 in bits 7:4. `offset` carries
+// the pre-encoded addressing-mode bits and is OR-ed in unchanged.
+void ARMAssembler::LDRSH(int cc, int Rd, int Rn, uint32_t offset)
+{
+    const uint32_t insn =
+            (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xF0 | offset;
+    *mPC++ = insn;
+}
+// Emits STRH: same layout as LDRH with bit 20 clear. `offset` carries the
+// pre-encoded addressing-mode bits and is OR-ed in unchanged.
+void ARMAssembler::STRH(int cc, int Rd, int Rn, uint32_t offset)
+{
+    const uint32_t insn =
+            (cc<<28) | (Rn<<16) | (Rd<<12) | 0xB0 | offset;
+    *mPC++ = insn;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Block Data Transfer...
+#endif
+
+// block data transfer...
+// Emits LDM. `dir` selects the addressing mode and indexes the P (bit 24)
+// and U (bit 23) tables below; `W` goes to bit 21 and `reg_list` is the
+// register bitmask. `dir` is used as an array index without a range check.
+void ARMAssembler::LDM(int cc, int dir,
+        int Rn, int W, uint32_t reg_list)
+{
+    // indexed by dir:              ED FD EA FA      IB IA DB DA
+    static const uint8_t kP[8] = { 1, 0, 1, 0,      1, 0, 1, 0 };
+    static const uint8_t kU[8] = { 1, 1, 0, 0,      1, 1, 0, 0 };
+    const uint32_t p = kP[dir];
+    const uint32_t u = kU[dir];
+    *mPC++ = (cc<<28) | (4<<25) | (p<<24) |
+            (u<<23) | (1<<20) | (W<<21) | (Rn<<16) | reg_list;
+}
+
+// Emits STM. `dir` selects the addressing mode and indexes the P (bit 24)
+// and U (bit 23) tables below; `W` goes to bit 21 and `reg_list` is the
+// register bitmask. `dir` is used as an array index without a range check.
+void ARMAssembler::STM(int cc, int dir,
+        int Rn, int W, uint32_t reg_list)
+{
+    // The tables are indexed by the same `dir` enumeration as LDM, i.e. in
+    // the order ED, FD, EA, FA, IB, IA, DB, DA. The stack-type entries hold
+    // different values than for LDM (FD, for instance, maps to P=1, U=0).
+    // indexed by dir:              ED FD EA FA      IB IA DB DA
+    static const uint8_t kP[8] = { 0, 1, 0, 1,      1, 0, 1, 0 };
+    static const uint8_t kU[8] = { 0, 0, 1, 1,      1, 1, 0, 0 };
+    const uint32_t p = kP[dir];
+    const uint32_t u = kU[dir];
+    *mPC++ = (cc<<28) | (4<<25) | (p<<24) |
+            (u<<23) | (W<<21) | (Rn<<16) | reg_list;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Special...
+#endif
+
+// special...
+// Emits SWP with Rn in bits 19:16, Rd in bits 15:12 and Rm in bits 3:0.
+void ARMAssembler::SWP(int cc, int Rn, int Rd, int Rm)
+{
+    const uint32_t insn =
+            (cc<<28) | (2<<23) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+    *mPC++ = insn;
+}
+// Emits SWPB: same layout as SWP with bit 22 set.
+void ARMAssembler::SWPB(int cc, int Rn, int Rd, int Rm)
+{
+    const uint32_t insn =
+            (cc<<28) | (2<<23) | (1<<22) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+    *mPC++ = insn;
+}
+// Emits SWI; `comment` is OR-ed into the low bits of the instruction as is
+// (it is not masked here).
+void ARMAssembler::SWI(int cc, uint32_t comment)
+{
+    const uint32_t insn = (cc<<28) | (0xF<<24) | comment;
+    *mPC++ = insn;
+}
+
+#if 0
+#pragma mark -
+#pragma mark DSP instructions...
+#endif
+
+// DSP instructions...
+// Emits PLD [Rn, offset]. `offset` must have bit 24 (P) set and bit 21 (W)
+// clear; anything else is fatal.
+void ARMAssembler::PLD(int Rn, uint32_t offset)
+{
+    LOG_ALWAYS_FATAL_IF(!(offset&(1<<24)) || (offset&(1<<21)),
+                        "PLD only P=1, W=0");
+    const uint32_t insn = 0xF550F000 | (Rn<<16) | offset;
+    *mPC++ = insn;
+}
+
+// Emits CLZ Rd, Rm.
+void ARMAssembler::CLZ(int cc, int Rd, int Rm)
+{
+    const uint32_t insn = (cc<<28) | 0x16F0F10| (Rd<<12) | Rm;
+    *mPC++ = insn;
+}
+
+// Emits QADD with Rn in bits 19:16, Rd in bits 15:12 and Rm in bits 3:0.
+void ARMAssembler::QADD(int cc,  int Rd, int Rm, int Rn)
+{
+    const uint32_t insn = (cc<<28) | 0x1000050 | (Rn<<16) | (Rd<<12) | Rm;
+    *mPC++ = insn;
+}
+
+// Emits QDADD with Rn in bits 19:16, Rd in bits 15:12 and Rm in bits 3:0.
+void ARMAssembler::QDADD(int cc,  int Rd, int Rm, int Rn)
+{
+    const uint32_t insn = (cc<<28) | 0x1400050 | (Rn<<16) | (Rd<<12) | Rm;
+    *mPC++ = insn;
+}
+
+// Emits QSUB with Rn in bits 19:16, Rd in bits 15:12 and Rm in bits 3:0.
+void ARMAssembler::QSUB(int cc,  int Rd, int Rm, int Rn)
+{
+    const uint32_t insn = (cc<<28) | 0x1200050 | (Rn<<16) | (Rd<<12) | Rm;
+    *mPC++ = insn;
+}
+
+// Emits QDSUB with Rn in bits 19:16, Rd in bits 15:12 and Rm in bits 3:0.
+void ARMAssembler::QDSUB(int cc,  int Rd, int Rm, int Rn)
+{
+    const uint32_t insn = (cc<<28) | 0x1600050 | (Rn<<16) | (Rd<<12) | Rm;
+    *mPC++ = insn;
+}
+
+// Emits SMUL<x><y>. `xy` is one of the xyBB/xyTB/xyBT/xyTT selectors and is
+// shifted left by four into the instruction word.
+void ARMAssembler::SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs)
+{
+    const uint32_t insn =
+            (cc<<28) | 0x1600080 | (Rd<<16) | (Rs<<8) | (xy<<4) | Rm;
+    *mPC++ = insn;
+}
+
+// Emits SMULW<y>. `y` is one of the yB/yT selectors and is shifted left by
+// four into the instruction word.
+void ARMAssembler::SMULW(int cc, int y,
+                int Rd, int Rm, int Rs)
+{
+    const uint32_t insn =
+            (cc<<28) | 0x12000A0 | (Rd<<16) | (Rs<<8) | (y<<4) | Rm;
+    *mPC++ = insn;
+}
+
+// Emits SMLA<x><y>. `xy` is one of the xyBB/xyTB/xyBT/xyTT selectors and is
+// shifted left by four; Rn is placed in bits 15:12.
+void ARMAssembler::SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn)
+{
+    const uint32_t insn =
+            (cc<<28) | 0x1000080 | (Rd<<16) | (Rn<<12) | (Rs<<8) | (xy<<4) | Rm;
+    *mPC++ = insn;
+}
+
+// Emits SMLAL<x><y>. `xy` is one of the xyBB/xyTB/xyBT/xyTT selectors and is
+// shifted left by four; RdHi goes to bits 19:16 and RdLo to bits 15:12.
+void ARMAssembler::SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm)
+{
+    const uint32_t insn =
+            (cc<<28) | 0x1400080 | (RdHi<<16) | (RdLo<<12) | (Rs<<8) | (xy<<4) | Rm;
+    *mPC++ = insn;
+}
+
+// Emits SMLAW<y>. `y` is one of the yB/yT selectors and is shifted left by
+// four; Rn is placed in bits 15:12.
+void ARMAssembler::SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn)
+{
+    const uint32_t insn =
+            (cc<<28) | 0x1200080 | (Rd<<16) | (Rn<<12) | (Rs<<8) | (y<<4) | Rm;
+    *mPC++ = insn;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Byte/half word extract and extend (ARMv6+ only)...
+#endif
+
+// Emits UXTB16 Rd, Rm. `rotate` is divided by eight (rotate >> 3) and the
+// result is placed starting at bit 10.
+void ARMAssembler::UXTB16(int cc, int Rd, int Rm, int rotate)
+{
+    const int rotateField = (rotate >> 3) << 10;
+    const uint32_t insn = (cc<<28) | 0x6CF0070 | (Rd<<12) | rotateField | Rm;
+    *mPC++ = insn;
+}
+
+}; // namespace android
+
@@ -0,0 +1,157 @@
+/* libs/pixelflinger/codeflinger/ARMAssembler.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#ifndef ANDROID_ARMASSEMBLER_H
+#define ANDROID_ARMASSEMBLER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "tinyutils/Vector.h"
+#include "tinyutils/KeyedVector.h"
+#include "tinyutils/smartpointer.h"
+
+#include "tinyutils/smartpointer.h"
+#include "codeflinger/ARMAssemblerInterface.h"
+#include "codeflinger/CodeCache.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+// Concrete ARMAssemblerInterface that writes 32-bit ARM instruction words
+// directly into the buffer of an Assembly. Branches to named labels are
+// recorded while emitting and patched by generate().
+class ARMAssembler : public ARMAssemblerInterface
+{
+public:
+                ARMAssembler(const sp<Assembly>& assembly);
+    virtual     ~ARMAssembler();
+
+    // first instruction word of the code being assembled
+    uint32_t*   base() const;
+    // position at which the next instruction word will be written
+    uint32_t*   pc() const;
+
+
+    // prints the code emitted so far (labels, comments, raw words) to stdout
+    void        disassemble(const char* name);
+
+    // ------------------------------------------------------------------------
+    // ARMAssemblerInterface...
+    // ------------------------------------------------------------------------
+
+    virtual void    reset();
+
+    virtual int     generate(const char* name);
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+    virtual void    comment(const char* string);
+
+    virtual void    dataProcessing(int opcode, int cc, int s,
+                                int Rd, int Rn,
+                                uint32_t Op2);
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs);
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+
+    virtual void B(int cc, uint32_t* pc);
+    virtual void BL(int cc, uint32_t* pc);
+    virtual void BX(int cc, int Rn);
+    virtual void label(const char* theLabel);
+    virtual void B(int cc, const char* label);
+    virtual void BL(int cc, const char* label);
+
+    virtual uint32_t* pcForLabel(const char* label);
+
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void LDRSB(int cc, int Rd, 
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+
+    virtual void SWP(int cc, int Rn, int Rd, int Rm);
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
+    virtual void SWI(int cc, uint32_t comment);
+
+    virtual void PLD(int Rn, uint32_t offset);
+    virtual void CLZ(int cc, int Rd, int Rm);
+    virtual void QADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs);
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs);
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm);
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate);
+
+private:
+    // Copying is declared private; presumably left undefined so the class is
+    // non-copyable (no definition appears in ARMAssembler.cpp).
+                ARMAssembler(const ARMAssembler& rhs);
+                ARMAssembler& operator = (const ARMAssembler& rhs);
+
+    sp<Assembly>    mAssembly;  // destination buffer; resized by generate()
+    uint32_t*       mBase;      // start of the buffer
+    uint32_t*       mPC;        // next instruction word to write
+    uint32_t*       mPrologPC;  // placeholder written by prolog(), patched by epilog()
+    int64_t         mDuration;  // ggl_system_time() at construction
+#if defined(WITH_LIB_HARDWARE)
+    bool            mQemuTracing;   // cleared once qemu_add_mapping() fails
+#endif
+    
+    // A branch emitted with a label argument: the label it refers to and the
+    // address of the branch instruction that generate() has to patch.
+    struct branch_target_t {
+        inline branch_target_t() : label(0), pc(0) { }
+        inline branch_target_t(const char* l, uint32_t* p)
+            : label(l), pc(p) { }
+        const char* label;
+        uint32_t*   pc;
+    };
+    
+    Vector<branch_target_t>                 mBranchTargets;
+    // label -> address. NOTE(review): keys are const char* values, so lookups
+    // presumably compare pointers rather than string contents -- confirm
+    // against KeyedVector.
+    KeyedVector< const char*, uint32_t* >   mLabels;
+    // address -> label and address -> comment, used by disassemble()
+    KeyedVector< uint32_t*, const char* >   mLabelsInverseMapping;
+    KeyedVector< uint32_t*, const char* >   mComments;
+};
+
+}; // namespace android
+
+#endif //ANDROID_ARMASSEMBLER_H
@@ -0,0 +1,173 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerInterface.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#include <errno.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <cutils/log.h>
+#include "codeflinger/ARMAssemblerInterface.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+// Out-of-line definition of the virtual destructor; nothing to do.
+ARMAssemblerInterface::~ARMAssemblerInterface() { }
+
+/*
+ * Tries to express `immediate` as an ARM data-processing immediate, i.e. an
+ * 8-bit value rotated right by an even amount.
+ *
+ * On return `imm` holds the candidate value and `rot` the 4-bit rotate
+ * field (the encoded value is `imm` rotated right by 2*rot bits).
+ *
+ * Returns 0 on success, -EINVAL when the candidate does not fit in 8 bits,
+ * and -1 when the candidate does not decode back to `immediate`.
+ */
+int ARMAssemblerInterface::buildImmediate(
+        uint32_t immediate, uint32_t& rot, uint32_t& imm)
+{
+    rot = 0;
+    imm = immediate;
+    if (imm > 0x7F) { // skip the easy cases
+        // Rotate right two bits at a time for as long as the two low bits
+        // are clear or any of the six top bits is set; give up after a full
+        // turn.
+        while (!(imm&3)  || (imm&0xFC000000)) {
+            uint32_t newval;
+            newval = imm >> 2;
+            newval |= (imm&3) << 30;
+            imm = newval;
+            rot += 2;
+            if (rot == 32) {
+                rot = 0;
+                break;
+            }
+        }
+    }
+    // convert "bits rotated right so far" into the rotate field
+    rot = (16 - (rot>>1)) & 0xF;
+
+    if (imm>=0x100)
+        return -EINVAL;
+
+    // Decode the pair again as a sanity check. A rotation of zero is handled
+    // separately: shifting a 32-bit value by 32 is undefined behaviour.
+    const uint32_t shift = rot<<1;
+    const uint32_t decoded =
+            shift ? ((imm>>shift) | (imm<<(32-shift))) : imm;
+    if (decoded != immediate)
+        return -1;
+
+    return 0;
+}
+
+// shifters...
+
+// True when buildImmediate() can encode `immediate`.
+bool ARMAssemblerInterface::isValidImmediate(uint32_t immediate)
+{
+    uint32_t rotation, value;
+    const int status = buildImmediate(immediate, rotation, value);
+    return status == 0;
+}
+
+// Builds the immediate form of a data-processing second operand: bit 25 set,
+// the rotate field from bit 8 and the 8-bit value in the low bits. Any
+// failure reported by buildImmediate() is fatal.
+uint32_t ARMAssemblerInterface::imm(uint32_t immediate)
+{
+    uint32_t rotation, value;
+    const int status = buildImmediate(immediate, rotation, value);
+
+    LOG_ALWAYS_FATAL_IF(status==-EINVAL,
+                        "immediate %08x cannot be encoded",
+                        immediate);
+
+    LOG_ALWAYS_FATAL_IF(status,
+                        "immediate (%08x) encoding bogus!",
+                        immediate);
+
+    const uint32_t operand = (1<<25) | (rotation<<8) | value;
+    return operand;
+}
+
+// Second operand "Rm shifted by an immediate": 5-bit shift amount from
+// bit 7, 2-bit shift type from bit 5, Rm in bits 3:0.
+uint32_t ARMAssemblerInterface::reg_imm(int Rm, int type, uint32_t shift)
+{
+    const uint32_t amountBits = (shift&0x1F)<<7;
+    const uint32_t typeBits = (type&0x3)<<5;
+    const uint32_t regBits = Rm&0xF;
+    return amountBits | typeBits | regBits;
+}
+
+// Second operand "Rm, RRX": the ROR shift type with a zero shift amount.
+uint32_t ARMAssemblerInterface::reg_rrx(int Rm)
+{
+    const uint32_t typeBits = ROR<<5;
+    const uint32_t regBits = Rm&0xF;
+    return typeBits | regBits;
+}
+
+// Second operand "Rm shifted by register Rs": Rs from bit 8, 2-bit shift
+// type from bit 5, bit 4 set, Rm in bits 3:0.
+uint32_t ARMAssemblerInterface::reg_reg(int Rm, int type, int Rs)
+{
+    const uint32_t shiftRegBits = (Rs&0xF)<<8;
+    const uint32_t typeBits = (type&0x3)<<5;
+    const uint32_t regBits = Rm&0xF;
+    return shiftRegBits | typeBits | (1<<4) | regBits;
+}
+
+// addressing modes... 
+// LDR(B)/STR(B)/PLD (immediate and Rm can be negative, which indicates U=0)
+// Pre-indexed immediate addressing mode for LDR(B)/STR(B)/PLD: bit 24 set,
+// bit 23 (U) set unless `immed12` is negative, `W` at bit 21 and the
+// magnitude in the low bits. Magnitudes of 0x800 or more are fatal.
+uint32_t ARMAssemblerInterface::immed12_pre(int32_t immed12, int W)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+    const uint32_t up = (uint32_t(immed12)>>31)^1;
+    const uint32_t writeBack = (W&1)<<21;
+    const uint32_t magnitude = abs(immed12)&0x7FF;
+    return (1<<24) | (up<<23) | writeBack | magnitude;
+}
+
+// Post-indexed immediate addressing mode for LDR(B)/STR(B): bit 23 (U) set
+// unless `immed12` is negative, magnitude in the low bits. Magnitudes of
+// 0x800 or more are fatal.
+uint32_t ARMAssemblerInterface::immed12_post(int32_t immed12)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+
+    const uint32_t up = (uint32_t(immed12)>>31)^1;
+    const uint32_t magnitude = abs(immed12)&0x7FF;
+    return (up<<23) | magnitude;
+}
+
+// Pre-indexed scaled-register addressing mode for LDR(B)/STR(B)/PLD: bits 25
+// and 24 set, bit 23 (U) set unless `Rm` is negative, `W` at bit 21, and
+// |Rm| with its shift encoded by reg_imm().
+uint32_t ARMAssemblerInterface::reg_scale_pre(int Rm, int type, 
+        uint32_t shift, int W)
+{
+    const uint32_t up = (uint32_t(Rm)>>31)^1;
+    const uint32_t writeBack = (W&1)<<21;
+    const uint32_t scaled = reg_imm(abs(Rm), type, shift);
+    return (1<<25) | (1<<24) | (up<<23) | writeBack | scaled;
+}
+
+// Post-indexed scaled-register addressing mode for LDR(B)/STR(B): bit 25
+// set, bit 23 (U) set unless `Rm` is negative, and |Rm| with its shift
+// encoded by reg_imm().
+uint32_t ARMAssemblerInterface::reg_scale_post(int Rm, int type, uint32_t shift)
+{
+    const uint32_t up = (uint32_t(Rm)>>31)^1;
+    const uint32_t scaled = reg_imm(abs(Rm), type, shift);
+    return (1<<25) | (up<<23) | scaled;
+}
+
+// LDRH/LDRSB/LDRSH/STRH (immediate and Rm can be negative, which indicates U=0)
+// Pre-indexed immediate addressing mode for LDRH/LDRSB/LDRSH/STRH: bits 24
+// and 22 set, bit 23 (U) set unless `immed8` is negative, `W` at bit 21. The
+// magnitude is split: its high nibble goes to bits 11:8 and its low nibble
+// to bits 3:0. Magnitudes of 0x100 or more are fatal.
+uint32_t ARMAssemblerInterface::immed8_pre(int32_t immed8, int W)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+
+    const uint32_t magnitude = abs(immed8);
+    const uint32_t up = (uint32_t(immed8)>>31)^1;
+    const uint32_t writeBack = (W&1)<<21;
+    const uint32_t split = ((magnitude&0xF0)<<4) | (magnitude&0xF);
+    return (1<<24) | (1<<22) | (up<<23) | writeBack | split;
+}
+
+// Post-indexed immediate addressing mode for LDRH/LDRSB/LDRSH/STRH: bit 22
+// set, bit 23 (U) set unless `immed8` is negative. The magnitude is split:
+// its high nibble goes to bits 11:8 and its low nibble to bits 3:0.
+// Magnitudes of 0x100 or more are fatal.
+uint32_t ARMAssemblerInterface::immed8_post(int32_t immed8)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+
+    const uint32_t magnitude = abs(immed8);
+    const uint32_t up = (uint32_t(immed8)>>31)^1;
+    const uint32_t split = ((magnitude&0xF0)<<4) | (magnitude&0xF);
+    return (1<<22) | (up<<23) | split;
+}
+
+// Pre-indexed register addressing mode for LDRH/LDRSB/LDRSH/STRH: bit 24
+// set, bit 23 (U) set unless `Rm` is negative, `W` at bit 21, |Rm| in
+// bits 3:0.
+uint32_t ARMAssemblerInterface::reg_pre(int Rm, int W)
+{
+    const uint32_t up = (uint32_t(Rm)>>31)^1;
+    const uint32_t writeBack = (W&1)<<21;
+    const uint32_t regBits = abs(Rm)&0xF;
+    return (1<<24) | (up<<23) | writeBack | regBits;
+}
+
+// Post-indexed register addressing mode for LDRH/LDRSB/LDRSH/STRH: bit 23
+// (U) set unless `Rm` is negative, |Rm| in bits 3:0.
+uint32_t ARMAssemblerInterface::reg_post(int Rm)
+{
+    const uint32_t up = (uint32_t(Rm)>>31)^1;
+    const uint32_t regBits = abs(Rm)&0xF;
+    return (up<<23) | regBits;
+}
+
+
+}; // namespace android
+
@@ -0,0 +1,327 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerInterface.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_ARMASSEMBLER_INTERFACE_H
+#define ANDROID_ARMASSEMBLER_INTERFACE_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+// Abstract interface of an ARM code emitter. It declares one pure virtual
+// method per supported instruction, static helpers that pre-encode second
+// operands and addressing modes, and inline convenience wrappers built on
+// top of the virtual methods.
+class ARMAssemblerInterface
+{
+public:
+    virtual ~ARMAssemblerInterface();
+
+    // condition codes, passed as the `cc` argument
+    enum {
+        EQ, NE, CS, CC, MI, PL, VS, VC, HI, LS, GE, LT, GT, LE, AL, NV,
+        HS = CS,
+        LO = CC
+    };
+    // NOTE(review): presumably the value to pass as the `s` argument to
+    // request a flag-setting instruction -- confirm against callers
+    enum {
+        S = 1
+    };
+    // shift types, passed as the `type` argument of the reg_* helpers
+    enum {
+        LSL, LSR, ASR, ROR
+    };
+    // addressing modes, passed as the `dir` argument of LDM/STM
+    enum {
+        ED, FD, EA, FA,
+        IB, IA, DB, DA
+    };
+    // register numbers
+    enum {
+        R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15,
+        SP = R13,
+        LR = R14,
+        PC = R15
+    };
+    // register-list bit masks: L<reg> is (1 << <reg>), e.g. LR4, LSP, LLR
+    enum {
+        #define LIST(rr) L##rr=1<<rr
+        LIST(R0), LIST(R1), LIST(R2), LIST(R3), LIST(R4), LIST(R5), LIST(R6),
+        LIST(R7), LIST(R8), LIST(R9), LIST(R10), LIST(R11), LIST(R12),
+        LIST(R13), LIST(R14), LIST(R15),
+        LIST(SP), LIST(LR), LIST(PC),
+        #undef LIST
+        // r4-r11 plus LR
+        LSAVED = LR4|LR5|LR6|LR7|LR8|LR9|LR10|LR11 | LLR
+    };
+
+    // -----------------------------------------------------------------------
+    // shifters and addressing modes
+    // -----------------------------------------------------------------------
+
+    // shifters...
+    static bool        isValidImmediate(uint32_t immed);
+    // returns 0 on success; rot and imm receive the rotate field and the
+    // 8-bit value
+    static int         buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm);
+
+    static uint32_t    imm(uint32_t immediate);
+    static uint32_t    reg_imm(int Rm, int type, uint32_t shift);
+    static uint32_t    reg_rrx(int Rm);
+    static uint32_t    reg_reg(int Rm, int type, int Rs);
+
+    // addressing modes... 
+    // LDR(B)/STR(B)/PLD
+    // (immediate and Rm can be negative, which indicates U=0)
+    static uint32_t    immed12_pre(int32_t immed12, int W=0);
+    static uint32_t    immed12_post(int32_t immed12);
+    static uint32_t    reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0);
+    static uint32_t    reg_scale_post(int Rm, int type=0, uint32_t shift=0);
+
+    // LDRH/LDRSB/LDRSH/STRH
+    // (immediate and Rm can be negative, which indicates U=0)
+    static uint32_t    immed8_pre(int32_t immed8, int W=0);
+    static uint32_t    immed8_post(int32_t immed8);
+    static uint32_t    reg_pre(int Rm, int W=0);
+    static uint32_t    reg_post(int Rm);
+
+    // -----------------------------------------------------------------------
+    // basic instructions & code generation
+    // -----------------------------------------------------------------------
+
+    // generate the code
+    virtual void reset() = 0;
+    virtual int  generate(const char* name) = 0;
+    virtual void disassemble(const char* name) = 0;
+    
+    // construct prolog and epilog
+    virtual void prolog() = 0;
+    virtual void epilog(uint32_t touched) = 0;
+    virtual void comment(const char* string) = 0;
+
+    // data processing...
+    // (opcode values, passed as the `opcode` argument of dataProcessing)
+    enum {
+        opAND, opEOR, opSUB, opRSB, opADD, opADC, opSBC, opRSC, 
+        opTST, opTEQ, opCMP, opCMN, opORR, opMOV, opBIC, opMVN
+    };
+
+    virtual void
+            dataProcessing( int opcode, int cc, int s,
+                            int Rd, int Rn,
+                            uint32_t Op2) = 0;
+    
+    // multiply...
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn) = 0;
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs) = 0;
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+
+    // branches...
+    virtual void B(int cc, uint32_t* pc) = 0;
+    virtual void BL(int cc, uint32_t* pc) = 0;
+    virtual void BX(int cc, int Rn) = 0;
+
+    virtual void label(const char* theLabel) = 0;
+    virtual void B(int cc, const char* label) = 0;
+    virtual void BL(int cc, const char* label) = 0;
+
+    // valid only after generate() has been called
+    virtual uint32_t* pcForLabel(const char* label) = 0;
+
+    // data transfer...
+    // (`offset` is a pre-encoded addressing mode from the helpers above)
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0)) = 0;
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0)) = 0;
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0)) = 0;
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0)) = 0;
+
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0)) = 0;
+    virtual void LDRSB(int cc, int Rd, 
+                int Rn, uint32_t offset = immed8_pre(0)) = 0;
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0)) = 0;
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0)) = 0;
+
+    // block data transfer...
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list) = 0;
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list) = 0;
+
+    // special...
+    virtual void SWP(int cc, int Rn, int Rd, int Rm) = 0;
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm) = 0;
+    virtual void SWI(int cc, uint32_t comment) = 0;
+
+    // DSP instructions...
+    // (half-word selectors, passed as the `xy` / `y` arguments below)
+    enum {
+        // B=0, T=1
+        //     yx
+        xyBB = 0, // 0000,
+        xyTB = 2, // 0010,
+        xyBT = 4, // 0100,
+        xyTT = 6, // 0110,
+        yB   = 0, // 0000,
+        yT   = 4, // 0100
+    };
+
+    virtual void PLD(int Rn, uint32_t offset) = 0;
+
+    virtual void CLZ(int cc, int Rd, int Rm) = 0;
+    
+    virtual void QADD(int cc, int Rd, int Rm, int Rn) = 0;
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn) = 0;
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn) = 0;
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn) = 0;
+    
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs) = 0;
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs) = 0;
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn) = 0;
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm) = 0;
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn) = 0;
+
+    // byte/half word extract...
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate) = 0;
+
+    // -----------------------------------------------------------------------
+    // convenience...
+    // -----------------------------------------------------------------------
+    // One wrapper per data-processing opcode; each forwards to
+    // dataProcessing(). MOV and MVN pass 0 for Rn; the compare/test wrappers
+    // pass s=1 and 0 for Rd.
+    inline void
+    ADC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opADC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    ADD(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opADD, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    AND(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opAND, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    BIC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opBIC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    EOR(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opEOR, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    MOV(int cc, int s, int Rd, uint32_t Op2) {
+        dataProcessing(opMOV, cc, s, Rd, 0, Op2);
+    }
+    inline void
+    MVN(int cc, int s, int Rd, uint32_t Op2) {
+        dataProcessing(opMVN, cc, s, Rd, 0, Op2);
+    }
+    inline void
+    ORR(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opORR, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    RSB(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opRSB, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    RSC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opRSC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    SBC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opSBC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    SUB(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opSUB, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    TEQ(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opTEQ, cc, 1, 0, Rn, Op2);
+    }
+    inline void
+    TST(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opTST, cc, 1, 0, Rn, Op2);
+    }
+    inline void
+    CMP(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opCMP, cc, 1, 0, Rn, Op2);
+    }
+    inline void
+    CMN(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opCMN, cc, 1, 0, Rn, Op2);
+    }
+
+    // DSP multiply wrappers: each fixes the half-word selector and forwards
+    // to the matching virtual method.
+    inline void SMULBB(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyBB, Rd, Rm, Rs);    }
+    inline void SMULTB(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyTB, Rd, Rm, Rs);    }
+    inline void SMULBT(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyBT, Rd, Rm, Rs);    }
+    inline void SMULTT(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyTT, Rd, Rm, Rs);    }
+
+    inline void SMULWB(int cc, int Rd, int Rm, int Rs) {
+        SMULW(cc, yB, Rd, Rm, Rs);    }
+    inline void SMULWT(int cc, int Rd, int Rm, int Rs) {
+        SMULW(cc, yT, Rd, Rm, Rs);    }
+
+    inline void
+    SMLABB(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyBB, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLATB(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyTB, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLABT(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyBT, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLATT(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyTT, Rd, Rm, Rs, Rn);    }
+
+    inline void
+    SMLALBB(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyBB, RdHi, RdLo, Rs, Rm);    }
+    inline void
+    SMLALTB(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyTB, RdHi, RdLo, Rs, Rm);    }
+    inline void
+    SMLALBT(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyBT, RdHi, RdLo, Rs, Rm);    }
+    inline void
+    SMLALTT(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyTT, RdHi, RdLo, Rs, Rm);    }
+
+    inline void
+    SMLAWB(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLAW(cc, yB, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLAWT(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLAW(cc, yT, Rd, Rm, Rs, Rn);    }
+};
+
+}; // namespace android
+
+#endif //ANDROID_ARMASSEMBLER_INTERFACE_H
@@ -0,0 +1,203 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerProxy.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "codeflinger/ARMAssemblerProxy.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+// Builds a proxy that has no target yet; one must be supplied through
+// setTarget() before any forwarding method is called.
+ARMAssemblerProxy::ARMAssemblerProxy() : mTarget(0) { }
+
+// Builds a proxy around `target`; the destructor deletes it.
+ARMAssemblerProxy::ARMAssemblerProxy(ARMAssemblerInterface* target) : mTarget(target) { }
+
+// Destroys the wrapped assembler (deleting a null target is a no-op).
+ARMAssemblerProxy::~ARMAssemblerProxy() {
+    delete mTarget;
+}
+
+// Replaces the wrapped assembler with `target`.
+// The previously held target is deleted. Passing the pointer that is already
+// installed is a no-op: deleting it first would leave mTarget dangling.
+void ARMAssemblerProxy::setTarget(ARMAssemblerInterface* target)
+{
+    if (target == mTarget) {
+        return;
+    }
+    delete mTarget;
+    mTarget = target;
+}
+
+// Relays reset() to the target.
+void ARMAssemblerProxy::reset()
+{
+    mTarget->reset();
+}
+// Relays generate() to the target and hands back its result unchanged.
+int ARMAssemblerProxy::generate(const char* name)
+{
+    const int result = mTarget->generate(name);
+    return result;
+}
+// Relays disassemble() to the target.
+void ARMAssemblerProxy::disassemble(const char* name)
+{
+    mTarget->disassemble(name);
+}
+// Relays prolog() to the target.
+void ARMAssemblerProxy::prolog()
+{
+    mTarget->prolog();
+}
+// Relays epilog() to the target.
+void ARMAssemblerProxy::epilog(uint32_t touched)
+{
+    mTarget->epilog(touched);
+}
+// Relays comment() to the target.
+void ARMAssemblerProxy::comment(const char* string)
+{
+    mTarget->comment(string);
+}
+
+
+// Relays dataProcessing() to the target, arguments unchanged.
+void ARMAssemblerProxy::dataProcessing(
+        int opcode, int cc, int s, int Rd, int Rn, uint32_t Op2) {
+    mTarget->dataProcessing(opcode, cc, s, Rd, Rn, Op2);
+}
+
+// Relays MLA() to the target.
+void ARMAssemblerProxy::MLA(int cc, int s, int Rd, int Rm, int Rs, int Rn)
+{
+    mTarget->MLA(cc, s, Rd, Rm, Rs, Rn);
+}
+// Relays MUL() to the target.
+void ARMAssemblerProxy::MUL(int cc, int s, int Rd, int Rm, int Rs)
+{
+    mTarget->MUL(cc, s, Rd, Rm, Rs);
+}
+// Relays UMULL() to the target.
+void ARMAssemblerProxy::UMULL(int cc, int s, int RdLo, int RdHi, int Rm, int Rs)
+{
+    mTarget->UMULL(cc, s, RdLo, RdHi, Rm, Rs);
+}
+// Relays UMUAL() to the target.
+void ARMAssemblerProxy::UMUAL(int cc, int s, int RdLo, int RdHi, int Rm, int Rs)
+{
+    mTarget->UMUAL(cc, s, RdLo, RdHi, Rm, Rs);
+}
+// Relays SMULL() to the target.
+void ARMAssemblerProxy::SMULL(int cc, int s, int RdLo, int RdHi, int Rm, int Rs)
+{
+    mTarget->SMULL(cc, s, RdLo, RdHi, Rm, Rs);
+}
+// Relays SMUAL() to the target.
+void ARMAssemblerProxy::SMUAL(int cc, int s, int RdLo, int RdHi, int Rm, int Rs)
+{
+    mTarget->SMUAL(cc, s, RdLo, RdHi, Rm, Rs);
+}
+
+// Relays B() (address overload) to the target.
+void ARMAssemblerProxy::B(int cc, uint32_t* pc)
+{
+    mTarget->B(cc, pc);
+}
+// Relays BL() (address overload) to the target.
+void ARMAssemblerProxy::BL(int cc, uint32_t* pc)
+{
+    mTarget->BL(cc, pc);
+}
+// Relays BX() to the target.
+void ARMAssemblerProxy::BX(int cc, int Rn)
+{
+    mTarget->BX(cc, Rn);
+}
+// Relays label() to the target.
+void ARMAssemblerProxy::label(const char* theLabel)
+{
+    mTarget->label(theLabel);
+}
+// Relays B() (label overload) to the target.
+void ARMAssemblerProxy::B(int cc, const char* label)
+{
+    mTarget->B(cc, label);
+}
+// Relays BL() (label overload) to the target.
+void ARMAssemblerProxy::BL(int cc, const char* label)
+{
+    mTarget->BL(cc, label);
+}
+
+// Asks the target for the address associated with `label` and returns it.
+uint32_t* ARMAssemblerProxy::pcForLabel(const char* label)
+{
+    uint32_t* const address = mTarget->pcForLabel(label);
+    return address;
+}
+
+// Relays LDR() to the target.
+void ARMAssemblerProxy::LDR(int cc, int Rd, int Rn, uint32_t offset)
+{
+    mTarget->LDR(cc, Rd, Rn, offset);
+}
+// Relays LDRB() to the target.
+void ARMAssemblerProxy::LDRB(int cc, int Rd, int Rn, uint32_t offset)
+{
+    mTarget->LDRB(cc, Rd, Rn, offset);
+}
+// Relays STR() to the target.
+void ARMAssemblerProxy::STR(int cc, int Rd, int Rn, uint32_t offset)
+{
+    mTarget->STR(cc, Rd, Rn, offset);
+}
+// Relays STRB() to the target.
+void ARMAssemblerProxy::STRB(int cc, int Rd, int Rn, uint32_t offset)
+{
+    mTarget->STRB(cc, Rd, Rn, offset);
+}
+// Relays LDRH() to the target.
+void ARMAssemblerProxy::LDRH(int cc, int Rd, int Rn, uint32_t offset)
+{
+    mTarget->LDRH(cc, Rd, Rn, offset);
+}
+// Relays LDRSB() to the target.
+void ARMAssemblerProxy::LDRSB(int cc, int Rd, int Rn, uint32_t offset)
+{
+    mTarget->LDRSB(cc, Rd, Rn, offset);
+}
+// Relays LDRSH() to the target.
+void ARMAssemblerProxy::LDRSH(int cc, int Rd, int Rn, uint32_t offset)
+{
+    mTarget->LDRSH(cc, Rd, Rn, offset);
+}
+// Relays STRH() to the target.
+void ARMAssemblerProxy::STRH(int cc, int Rd, int Rn, uint32_t offset)
+{
+    mTarget->STRH(cc, Rd, Rn, offset);
+}
+// Relays LDM() to the target.
+void ARMAssemblerProxy::LDM(int cc, int dir, int Rn, int W, uint32_t reg_list)
+{
+    mTarget->LDM(cc, dir, Rn, W, reg_list);
+}
+// Relays STM() to the target.
+void ARMAssemblerProxy::STM(int cc, int dir, int Rn, int W, uint32_t reg_list)
+{
+    mTarget->STM(cc, dir, Rn, W, reg_list);
+}
+
+// Relays SWP() to the target.
+void ARMAssemblerProxy::SWP(int cc, int Rn, int Rd, int Rm)
+{
+    mTarget->SWP(cc, Rn, Rd, Rm);
+}
+// Relays SWPB() to the target.
+void ARMAssemblerProxy::SWPB(int cc, int Rn, int Rd, int Rm)
+{
+    mTarget->SWPB(cc, Rn, Rd, Rm);
+}
+// Relays SWI() to the target.
+void ARMAssemblerProxy::SWI(int cc, uint32_t comment)
+{
+    mTarget->SWI(cc, comment);
+}
+
+
+// Relays PLD() to the target.
+void ARMAssemblerProxy::PLD(int Rn, uint32_t offset)
+{
+    mTarget->PLD(Rn, offset);
+}
+// Relays CLZ() to the target.
+void ARMAssemblerProxy::CLZ(int cc, int Rd, int Rm)
+{
+    mTarget->CLZ(cc, Rd, Rm);
+}
+// Relays QADD() to the target.
+void ARMAssemblerProxy::QADD(int cc, int Rd, int Rm, int Rn)
+{
+    mTarget->QADD(cc, Rd, Rm, Rn);
+}
+// Relays QDADD() to the target.
+void ARMAssemblerProxy::QDADD(int cc, int Rd, int Rm, int Rn)
+{
+    mTarget->QDADD(cc, Rd, Rm, Rn);
+}
+// Relays QSUB() to the target.
+void ARMAssemblerProxy::QSUB(int cc, int Rd, int Rm, int Rn)
+{
+    mTarget->QSUB(cc, Rd, Rm, Rn);
+}
+// Relays QDSUB() to the target.
+void ARMAssemblerProxy::QDSUB(int cc, int Rd, int Rm, int Rn)
+{
+    mTarget->QDSUB(cc, Rd, Rm, Rn);
+}
+// Relays SMUL() to the target.
+void ARMAssemblerProxy::SMUL(int cc, int xy, int Rd, int Rm, int Rs)
+{
+    mTarget->SMUL(cc, xy, Rd, Rm, Rs);
+}
+// Relays SMULW() to the target.
+void ARMAssemblerProxy::SMULW(int cc, int y, int Rd, int Rm, int Rs)
+{
+    mTarget->SMULW(cc, y, Rd, Rm, Rs);
+}
+// Relays SMLA() to the target.
+void ARMAssemblerProxy::SMLA(int cc, int xy, int Rd, int Rm, int Rs, int Rn)
+{
+    mTarget->SMLA(cc, xy, Rd, Rm, Rs, Rn);
+}
+// Relays SMLAL() to the target.
+void ARMAssemblerProxy::SMLAL(int cc, int xy, int RdHi, int RdLo, int Rs, int Rm)
+{
+    mTarget->SMLAL(cc, xy, RdHi, RdLo, Rs, Rm);
+}
+// Relays SMLAW() to the target.
+void ARMAssemblerProxy::SMLAW(int cc, int y, int Rd, int Rm, int Rs, int Rn)
+{
+    mTarget->SMLAW(cc, y, Rd, Rm, Rs, Rn);
+}
+
+// Relays UXTB16() to the target.
+void ARMAssemblerProxy::UXTB16(int cc, int Rd, int Rm, int rotate)
+{
+    mTarget->UXTB16(cc, Rd, Rm, rotate);
+}
+
+}; // namespace android
+
@@ -0,0 +1,125 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerProxy.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_ARMASSEMBLER_PROXY_H
+#define ANDROID_ARMASSEMBLER_PROXY_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "codeflinger/ARMAssemblerInterface.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+// ARMAssemblerInterface implementation that holds a pointer to another
+// ARMAssemblerInterface (the "target") and declares the same set of
+// operations so they can be relayed to it.
+class ARMAssemblerProxy : public ARMAssemblerInterface
+{
+public:
+    // ARMAssemblerProxy takes ownership of the target.
+    // NOTE(review): no copy constructor / assignment operator is declared
+    // here, so copying a proxy would give two owners of one target --
+    // confirm that instances are never copied.
+
+                ARMAssemblerProxy();
+                ARMAssemblerProxy(ARMAssemblerInterface* target);
+    virtual     ~ARMAssemblerProxy();
+
+    // Installs a new target.
+    void setTarget(ARMAssemblerInterface* target);
+
+    // Code-generation life cycle.
+    virtual void    reset();
+    virtual int     generate(const char* name);
+    virtual void    disassemble(const char* name);
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+    virtual void    comment(const char* string);
+
+    // Data-processing and multiply instructions.
+    virtual void    dataProcessing(int opcode, int cc, int s,
+                                int Rd, int Rn,
+                                uint32_t Op2);
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs);
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+
+    // Branches, either to an explicit address or to a named label.
+    virtual void B(int cc, uint32_t* pc);
+    virtual void BL(int cc, uint32_t* pc);
+    virtual void BX(int cc, int Rn);
+    virtual void label(const char* theLabel);
+    virtual void B(int cc, const char* label);
+    virtual void BL(int cc, const char* label);
+
+    uint32_t* pcForLabel(const char* label);
+
+    // Loads and stores. The word/byte forms default to immed12_pre(0),
+    // the halfword/signed forms to immed8_pre(0).
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = immed12_pre(0));
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void LDRSB(int cc, int Rd, 
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = immed8_pre(0));
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+
+    // Swap and software interrupt.
+    virtual void SWP(int cc, int Rn, int Rd, int Rm);
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
+    virtual void SWI(int cc, uint32_t comment);
+
+    // Preload, count-leading-zeros, saturating arithmetic and the
+    // SMUL / SMLA families.
+    virtual void PLD(int Rn, uint32_t offset);
+    virtual void CLZ(int cc, int Rd, int Rm);
+    virtual void QADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs);
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs);
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm);
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn);
+
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate);
+
+private:
+    // Assembler that receives the relayed calls; owned by this proxy.
+    ARMAssemblerInterface*  mTarget;
+};
+
+}; // namespace android
+
+#endif //ANDROID_ARMASSEMBLER_PROXY_H
@@ -0,0 +1,173 @@
+/* libs/pixelflinger/codeflinger/CodeCache.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include <cutils/log.h>
+#include <cutils/atomic.h>
+
+#include "codeflinger/CodeCache.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+#if defined(__arm__)
+#include <unistd.h>
+#include <errno.h>
+#endif
+
+// ----------------------------------------------------------------------------
+
+// Allocates `size` bytes from the shared mspace and makes the pages that
+// hold them executable. The reference count starts at 1.
+Assembly::Assembly(size_t size)
+    : mCount(1),
+      mBase((uint32_t*)mspace_malloc(getMspace(), size)),
+      mSize(size)
+{
+    ensureMbaseExecutable();
+}
+
+// Hands the code buffer back to the shared mspace.
+Assembly::~Assembly() {
+    mspace_free(getMspace(), mBase);
+}
+
+// sp<> protocol: atomically adds one reference. The id argument is unused.
+void Assembly::incStrong(const void* /*id*/) const {
+    android_atomic_inc(&mCount);
+}
+
+// sp<> protocol: atomically drops one reference and destroys the object when
+// android_atomic_dec() reports 1 (the comparison implies it yields the count
+// as it was before the decrement -- confirm against cutils/atomic.h).
+void Assembly::decStrong(const void* /*id*/) const {
+    const int32_t previous = android_atomic_dec(&mCount);
+    if (previous == 1) {
+        delete this;
+    }
+}
+
+// Returns the buffer size in bytes, or NO_MEMORY when there is no buffer.
+ssize_t Assembly::size() const
+{
+    if (mBase) {
+        return mSize;
+    }
+    return NO_MEMORY;
+}
+
+// Returns the start of the code buffer (NULL when allocation failed).
+uint32_t* Assembly::base() const {
+    return mBase;
+}
+
+// Resizes the code buffer to `newSize` bytes and re-applies the executable
+// protection. Returns the new size, or NO_MEMORY (via size()) on failure, in
+// which case the assembly no longer owns a buffer.
+ssize_t Assembly::resize(size_t newSize)
+{
+    uint32_t* const oldBase = mBase;
+    mBase = (uint32_t*)mspace_realloc(getMspace(), oldBase, newSize);
+    if (!mBase && oldBase && newSize) {
+        // A failed realloc leaves the original block allocated; since mBase
+        // has just been overwritten with NULL, free it here or it is leaked.
+        // (newSize == 0 is excluded because some realloc configurations free
+        // the block themselves in that case.)
+        mspace_free(getMspace(), oldBase);
+    }
+    mSize = newSize;
+    ensureMbaseExecutable();
+    return size();
+}
+
+// Returns the mspace shared by every Assembly. It is a function-local
+// static, so it is created the first time this function runs.
+mspace Assembly::getMspace()
+{
+    static mspace sSharedSpace =
+            create_contiguous_mspace(2 * 1024, 1024 * 1024, /*locked=*/ false);
+    return sSharedSpace;
+}
+
+// Marks the pages spanned by [mBase, mBase + mSize) readable, writable and
+// executable. mprotect() works on whole pages, so the range is extended
+// backwards to the start of the page containing mBase. If the protection
+// change fails the buffer is freed and mBase becomes NULL.
+void Assembly::ensureMbaseExecutable()
+{
+    if (!mBase) {
+        return;     // nothing was allocated
+    }
+
+    const long pageSize = sysconf(_SC_PAGESIZE);
+    const long pageMask = ~(pageSize - 1);  // assumes pagesize is a power of 2
+
+    uint32_t* const firstPage = (uint32_t*) (((uintptr_t) mBase) & pageMask);
+    const size_t length = (mBase - firstPage) * sizeof(uint32_t) + mSize;
+
+    const int rc = mprotect(firstPage, length, PROT_READ | PROT_WRITE | PROT_EXEC);
+    if (rc != 0) {
+        mspace_free(getMspace(), mBase);
+        mBase = NULL;
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+// Creates an empty cache that may hold up to `size` bytes of assemblies.
+// mWhen (the LRU clock read by lookup() and cache()) starts at zero; it was
+// previously left uninitialised. Initialisers follow declaration order.
+CodeCache::CodeCache(size_t size)
+    : mWhen(0), mCacheSize(size), mCacheInUse(0)
+{
+    pthread_mutex_init(&mLock, 0);
+}
+
+// Releases the mutex guarding the cache.
+CodeCache::~CodeCache() {
+    pthread_mutex_destroy(&mLock);
+}
+
+// Looks `keyBase` up under the cache lock. On a hit the entry's timestamp is
+// refreshed from the mWhen clock and its assembly is returned; on a miss the
+// returned sp<> is left default-constructed.
+sp<Assembly> CodeCache::lookup(const AssemblyKeyBase& keyBase) const
+{
+    sp<Assembly> found;
+
+    pthread_mutex_lock(&mLock);
+    const ssize_t pos = mCacheData.indexOfKey(key_t(keyBase));
+    if (pos >= 0) {
+        const cache_entry_t& hit = mCacheData.valueAt(pos);
+        hit.when = mWhen++;     // `when` is mutable: mark as most recently used
+        found = hit.entry;
+    }
+    pthread_mutex_unlock(&mLock);
+
+    return found;
+}
+
+// Inserts `assembly` under `keyBase`, evicting least-recently-used entries
+// until it fits within mCacheSize, then flushes the CPU caches for the new
+// code on ARM.
+//
+// Returns a negative error code if the assembly reports an error size or the
+// insertion fails; otherwise the result of the insertion (or of cacheflush()
+// on ARM builds).
+int CodeCache::cache(  const AssemblyKeyBase& keyBase,
+                            const sp<Assembly>& assembly)
+{
+    pthread_mutex_lock(&mLock);
+
+    const ssize_t assemblySize = assembly->size();
+    if (assemblySize < 0) {
+        // size() returns an error code when the assembly has no buffer.
+        // Adding it to the unsigned mCacheInUse would wrap around, so refuse
+        // to cache it.
+        pthread_mutex_unlock(&mLock);
+        return int(assemblySize);
+    }
+
+    // Stop evicting once the cache is empty: an assembly larger than the
+    // whole cache can never fit, and valueAt() must not be called on an
+    // empty vector.
+    while (mCacheData.size() > 0 &&
+            mCacheInUse + assemblySize > mCacheSize) {
+        // evict the LRU
+        size_t lru = 0;
+        const size_t count = mCacheData.size();
+        for (size_t i=1 ; i<count ; i++) {
+            const cache_entry_t& e = mCacheData.valueAt(i);
+            if (e.when < mCacheData.valueAt(lru).when) {
+                lru = i;
+            }
+        }
+        const cache_entry_t& e = mCacheData.valueAt(lru);
+        mCacheInUse -= e.entry->size();
+        mCacheData.removeItemsAt(lru);
+    }
+
+    ssize_t err = mCacheData.add(key_t(keyBase), cache_entry_t(assembly, mWhen));
+    if (err >= 0) {
+        mCacheInUse += assemblySize;
+        mWhen++;
+        // synchronize caches...
+#if defined(__arm__)
+        const long base = long(assembly->base());
+        const long curr = base + long(assembly->size());
+        err = cacheflush(base, curr, 0);
+        LOGE_IF(err, "__ARM_NR_cacheflush error %s\n",
+                strerror(errno));
+#endif
+    }
+
+    pthread_mutex_unlock(&mLock);
+    return err;
+}
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
@@ -0,0 +1,137 @@
+/* libs/pixelflinger/codeflinger/CodeCache.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_CODECACHE_H
+#define ANDROID_CODECACHE_H
+
+#include <stdint.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <cutils/mspace.h>
+
+#include "tinyutils/KeyedVector.h"
+#include "tinyutils/smartpointer.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+// Abstract key type accepted by CodeCache::lookup() and CodeCache::cache().
+class AssemblyKeyBase {
+public:
+    virtual ~AssemblyKeyBase() { }
+    // Compares this key against `key`; the result is what CodeCache's
+    // compare_type() overload returns for two cache entries.
+    virtual int compare_type(const AssemblyKeyBase& key) const = 0;
+};
+
+// Concrete cache key wrapping a copy of a value of type T.
+template <typename T>
+class AssemblyKey : public AssemblyKeyBase
+{
+public:
+    AssemblyKey(const T& rhs) : mKey(rhs) { }
+
+    // Compares the wrapped values with android::compare_type().
+    // `key` is downcast without any check, so it must really be an
+    // AssemblyKey<T> of the same T.
+    virtual int compare_type(const AssemblyKeyBase& key) const {
+        const AssemblyKey& other = static_cast<const AssemblyKey&>(key);
+        return android::compare_type(mKey, other.mKey);
+    }
+
+private:
+    T mKey;
+};
+
+// ----------------------------------------------------------------------------
+
+// Reference-counted buffer of generated code. It provides the
+// incStrong()/decStrong() protocol so it can be held in an sp<>.
+// NOTE(review): the class holds a raw buffer pointer but declares no copy
+// constructor / assignment operator -- confirm instances are never copied.
+class Assembly
+{
+public:
+                Assembly(size_t size);
+    virtual     ~Assembly();
+
+    // Size of the buffer (ssize_t, so an error code can be returned instead).
+    ssize_t     size() const;
+    // Start of the buffer.
+    uint32_t*   base() const;
+    // Changes the buffer size; returns the same kind of value as size().
+    ssize_t     resize(size_t size);
+
+    // protocol for sp<>
+            void    incStrong(const void* id) const;
+            void    decStrong(const void* id) const;
+    typedef void    weakref_type;
+
+private:
+    static  mspace  getMspace();
+            void    ensureMbaseExecutable();
+
+    // mutable so the const incStrong()/decStrong() can update it
+    mutable int32_t     mCount;
+            uint32_t*   mBase;
+            size_t      mSize;
+};
+
+// ----------------------------------------------------------------------------
+
+// Cache of Assembly objects indexed by AssemblyKeyBase, with a byte budget
+// (mCacheSize) and per-entry timestamps used to pick an eviction victim.
+class CodeCache
+{
+public:
+// pretty simple cache API...
+                CodeCache(size_t size);
+                ~CodeCache();
+    
+            sp<Assembly>        lookup(const AssemblyKeyBase& key) const;
+
+            int                 cache(  const AssemblyKeyBase& key,
+                                        const sp<Assembly>& assembly);
+
+private:
+    // One cached assembly plus the clock value of its last use.
+    struct cache_entry_t {
+        // `when` is zeroed so a default-constructed entry never carries an
+        // indeterminate timestamp.
+        inline cache_entry_t() : when(0) { }
+        inline cache_entry_t(const sp<Assembly>& a, int64_t w)
+                : entry(a), when(w) { }
+        sp<Assembly>            entry;
+        mutable int64_t         when;   // mutable: refreshed through const access
+    };
+
+    // Key wrapper stored in the vector. It keeps a POINTER to the caller's
+    // key object rather than a copy, so that object must stay alive for as
+    // long as the entry is in the cache.
+    class key_t {
+        friend int compare_type(
+            const key_value_pair_t<key_t, cache_entry_t>&,
+            const key_value_pair_t<key_t, cache_entry_t>&);
+        const AssemblyKeyBase* mKey;
+    public:
+        // Null rather than indeterminate when default-constructed.
+        key_t() : mKey(0) { }
+        key_t(const AssemblyKeyBase& k) : mKey(&k)  { }
+    };
+
+    mutable pthread_mutex_t             mLock;          // guards all fields below
+    mutable int64_t                     mWhen;          // monotonically increasing use counter
+    size_t                              mCacheSize;     // byte budget
+    size_t                              mCacheInUse;    // bytes currently cached
+    KeyedVector<key_t, cache_entry_t>   mCacheData;
+
+    friend int compare_type(
+        const key_value_pair_t<key_t, cache_entry_t>&,
+        const key_value_pair_t<key_t, cache_entry_t>&);
+};
+
+// KeyedVector orders its entries with compare_type(), which is more
+// efficient than relying on operator < () alone. Only the keys take part:
+// the comparison is delegated to the AssemblyKeyBase stored in the
+// left-hand key.
+inline int compare_type(
+    const key_value_pair_t<CodeCache::key_t, CodeCache::cache_entry_t>& lhs,
+    const key_value_pair_t<CodeCache::key_t, CodeCache::cache_entry_t>& rhs)
+{
+    const AssemblyKeyBase* const left  = lhs.key.mKey;
+    const AssemblyKeyBase* const right = rhs.key.mKey;
+    return left->compare_type(*right);
+}
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
+#endif //ANDROID_CODECACHE_H
@@ -0,0 +1,554 @@
+/* libs/pixelflinger/codeflinger/GGLAssembler.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_GGLASSEMBLER_H
+#define ANDROID_GGLASSEMBLER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <private/pixelflinger/ggl_context.h>
+
+#include "codeflinger/ARMAssemblerProxy.h"
+
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+// Expands to an unconditional (AL) LDR of register REG from the address
+// formed by mBuilderContext.Rctx plus GGL_OFFSETOF(FIELD). It can only be
+// used where LDR, AL, immed12_pre and mBuilderContext are in scope.
+#define CONTEXT_LOAD(REG, FIELD) \
+    LDR(AL, REG, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(FIELD)))
+
+// Store counterpart of CONTEXT_LOAD: expands to an unconditional (AL) STR of
+// register REG to mBuilderContext.Rctx plus GGL_OFFSETOF(FIELD).
+#define CONTEXT_STORE(REG, FIELD) \
+    STR(AL, REG, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(FIELD)))
+
+
+// Register bookkeeping used while generating code: a RegisterFile that
+// tracks register state, plus two scope-bound helpers (Scratch and Spill)
+// that give registers back when they are destroyed.
+class RegisterAllocator
+{
+public:
+    class RegisterFile;
+
+    RegisterFile&   registerFile();
+    int             reserveReg(int reg);
+    int             obtainReg();
+    void            recycleReg(int reg);
+    void            reset();
+
+    // State of the register set. The member functions are defined outside
+    // this header.
+    class RegisterFile
+    {
+    public:
+                            RegisterFile();
+                            RegisterFile(const RegisterFile& rhs);
+                            ~RegisterFile();
+
+                void        reset();
+
+                bool operator == (const RegisterFile& rhs) const;
+                bool operator != (const RegisterFile& rhs) const {
+                    return !(*this == rhs);
+                }
+
+                int         reserve(int reg);
+                void        reserveSeveral(uint32_t regMask);
+
+                void        recycle(int reg);
+                void        recycleSeveral(uint32_t regMask);
+
+                int         obtain();
+        inline  int         isUsed(int reg) const;
+
+                bool        hasFreeRegs() const;
+                int         countFreeRegs() const;
+
+                uint32_t    touched() const;
+        inline  uint32_t    status() const { return mStatus; }
+
+        // bit flags reported by status()
+        enum {
+            OUT_OF_REGISTERS = 0x1
+        };
+
+    private:
+        uint32_t    mRegs;
+        uint32_t    mTouched;
+        uint32_t    mStatus;
+    };
+
+    // Scoped pool of temporary registers: every register handed out by
+    // obtain() is remembered in a bit mask, and whatever is still in the
+    // mask is recycled when the Scratch object goes out of scope.
+    class Scratch
+    {
+    public:
+            Scratch(RegisterFile& regFile)
+                : mRegFile(regFile), mScratch(0) {
+            }
+            ~Scratch() {
+                mRegFile.recycleSeveral(mScratch);
+            }
+        // Takes a register from the file and records it in the mask.
+        int obtain() {
+            const int r = mRegFile.obtain();
+            mScratch |= 1<<r;
+            return r;
+        }
+        // Returns `reg` to the file right away and forgets it.
+        void recycle(int reg) {
+            mRegFile.recycle(reg);
+            mScratch &= ~(1<<reg);
+        }
+        // True when `reg` is currently recorded in this object's mask.
+        bool isUsed(int reg) {
+            return (mScratch & (1<<reg)) != 0;
+        }
+        int countFreeRegs() {
+            return mRegFile.countFreeRegs();
+        }
+    private:
+        RegisterFile&   mRegFile;
+        uint32_t        mScratch;   // one bit per register obtained here
+    };
+
+    // Scoped spill of the registers in `reglist`: the constructor emits the
+    // store(s) relative to SP and recycles the registers, the destructor
+    // emits the matching load(s) and reserves them again.
+    class Spill
+    {
+    public:
+        Spill(RegisterFile& regFile, ARMAssemblerInterface& gen, uint32_t reglist)
+            : mRegFile(regFile), mGen(gen), mRegList(reglist), mCount(0)
+        {
+            if (!reglist) {
+                return;     // nothing to spill
+            }
+            // number of registers named in the list
+            const int count = __builtin_popcount(reglist);
+            if (count == 1) {
+                // a lone register gets a single STR instead of an STM
+                const int reg = 31 - __builtin_clz(mRegList);
+                mGen.STR(mGen.AL, reg, mGen.SP, mGen.immed12_pre(-4, 1));
+            } else {
+                mGen.STM(mGen.AL, mGen.DB, mGen.SP, 1, mRegList);
+            }
+            mRegFile.recycleSeveral(mRegList);
+            mCount = count;
+        }
+        ~Spill() {
+            if (!mRegList) {
+                return;     // nothing was spilled
+            }
+            if (mCount == 1) {
+                // mirror of the single-register store above
+                const int reg = 31 - __builtin_clz(mRegList);
+                mGen.LDR(mGen.AL, reg, mGen.SP, mGen.immed12_post(4));
+            } else {
+                mGen.LDM(mGen.AL, mGen.IA, mGen.SP, 1, mRegList);
+            }
+            mRegFile.reserveSeveral(mRegList);
+        }
+    private:
+        RegisterFile&           mRegFile;
+        ARMAssemblerInterface&  mGen;
+        uint32_t                mRegList;   // registers spilled by this object
+        int                     mCount;     // how many bits are set in mRegList
+    };
+
+private:
+    RegisterFile    mRegs;
+};
+
+// ----------------------------------------------------------------------------
+
+class GGLAssembler : public ARMAssemblerProxy, public RegisterAllocator
+{
+public:
+
+                    GGLAssembler(ARMAssemblerInterface* target);
+        virtual     ~GGLAssembler();
+
+    uint32_t*   base() const { return 0; } // XXX
+    uint32_t*   pc() const { return 0; } // XXX
+
+    void        reset(int opt_level);
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+
+        // generate scanline code for given needs
+    int         scanline(const needs_t& needs, context_t const* c);
+    int         scanline_core(const needs_t& needs, context_t const* c);
+
+        enum {
+            CLEAR_LO    = 0x0001,
+            CLEAR_HI    = 0x0002,
+            CORRUPTIBLE = 0x0004,
+            FIRST       = 0x0008
+        };
+
+        enum { //load/store flags
+            WRITE_BACK  = 0x0001
+        };
+
+        struct reg_t {
+            reg_t() : reg(-1), flags(0) {
+            }
+            reg_t(int r, int f=0)
+                : reg(r), flags(f) {
+            }
+            void setTo(int r, int f=0) {
+                reg=r; flags=f;
+            }
+            int         reg;
+            uint16_t    flags;
+        };
+
+        struct integer_t : public reg_t {
+            integer_t() : reg_t(), s(0) {
+            }
+            integer_t(int r, int sz=32, int f=0)
+                : reg_t(r, f), s(sz) {
+            }
+            void setTo(int r, int sz=32, int f=0) {
+                reg_t::setTo(r, f); s=sz;
+            }
+            int8_t s;
+            inline int size() const { return s; }
+        };
+        
+        struct pixel_t : public reg_t {
+            pixel_t() : reg_t() {
+                memset(&format, 0, sizeof(GGLFormat));
+            }
+            pixel_t(int r, const GGLFormat* fmt, int f=0)
+                : reg_t(r, f), format(*fmt) {
+            }
+            void setTo(int r, const GGLFormat* fmt, int f=0) {
+                reg_t::setTo(r, f); format = *fmt;
+            }
+            GGLFormat format;
+            inline int hi(int c) const { return format.c[c].h; }
+            inline int low(int c) const { return format.c[c].l; }
+            inline int mask(int c) const { return ((1<<size(c))-1) << low(c); }
+            inline int size() const { return format.size*8; }
+            inline int size(int c) const { return component_size(c); }
+            inline int component_size(int c) const { return hi(c) - low(c); }
+        };
+
+        struct component_t : public reg_t {
+            component_t() : reg_t(), h(0), l(0) {
+            }
+            component_t(int r, int f=0)
+                : reg_t(r, f), h(0), l(0) {
+            }
+            component_t(int r, int lo, int hi, int f=0)
+                : reg_t(r, f), h(hi), l(lo) {
+            }
+            explicit component_t(const integer_t& rhs)
+                : reg_t(rhs.reg, rhs.flags), h(rhs.s), l(0) {
+            }
+            explicit component_t(const pixel_t& rhs, int component) {
+                setTo(  rhs.reg, 
+                        rhs.format.c[component].l,
+                        rhs.format.c[component].h,
+                        rhs.flags|CLEAR_LO|CLEAR_HI);
+            }
+            void setTo(int r, int lo=0, int hi=0, int f=0) {
+                reg_t::setTo(r, f); h=hi; l=lo;
+            }
+            int8_t h;
+            int8_t l;
+            inline int size() const { return h-l; }
+        };
+
+        struct pointer_t : public reg_t {
+            pointer_t() : reg_t(), size(0) {
+            }
+            pointer_t(int r, int s, int f=0)
+                : reg_t(r, f), size(s) {
+            }
+            void setTo(int r, int s, int f=0) {
+                reg_t::setTo(r, f); size=s;
+            }
+            int8_t size;
+        };
+
+
+private:
+    struct tex_coord_t {
+        reg_t       s;
+        reg_t       t;
+        pointer_t   ptr;
+    };
+
+    struct fragment_parts_t {
+        uint32_t    packed  : 1;
+        uint32_t    reload  : 2;
+        uint32_t    iterated_packed  : 1;
+        pixel_t     iterated;
+        pointer_t   cbPtr;
+        pointer_t   covPtr;
+        reg_t       count;
+        reg_t       argb[4];
+        reg_t       argb_dx[4];
+        reg_t       z;
+        reg_t       dither;
+        pixel_t     texel[GGL_TEXTURE_UNIT_COUNT];
+        tex_coord_t coords[GGL_TEXTURE_UNIT_COUNT];
+    };
+    
+    struct texture_unit_t {
+        int         format_idx;
+        GGLFormat   format;
+        int         bits;
+        int         swrap;
+        int         twrap;
+        int         env;
+        int         pot;
+        int         linear;
+        uint8_t     mask;
+        uint8_t     replaced;
+    };
+
+    struct texture_machine_t {
+        texture_unit_t  tmu[GGL_TEXTURE_UNIT_COUNT];
+        uint8_t         mask;
+        uint8_t         replaced;
+        uint8_t         directTexture;
+        uint8_t         activeUnits;
+    };
+
+    struct component_info_t {
+        bool    masked      : 1;
+        bool    inDest      : 1;
+        bool    needed      : 1;
+        bool    replaced    : 1;
+        bool    iterated    : 1;
+        bool    smooth      : 1;
+        bool    blend       : 1;
+        bool    fog         : 1;
+    };
+
+    struct builder_context_t {
+        context_t const*    c;
+        needs_t             needs;
+        int                 Rctx;
+    };
+
+    template <typename T>
+    void modify(T& r, Scratch& regs)
+    {
+        if (!(r.flags & CORRUPTIBLE)) {
+            r.reg = regs.obtain();
+            r.flags |= CORRUPTIBLE;
+        }
+    }
+
+    // helpers
+    void    base_offset(const pointer_t& d, const pointer_t& b, const reg_t& o);
+
+    // texture environement
+    void    modulate(   component_t& dest,
+                        const component_t& incoming,
+                        const pixel_t& texel, int component);
+
+    void    decal(  component_t& dest,
+                    const component_t& incoming,
+                    const pixel_t& texel, int component);
+
+    void    blend(  component_t& dest,
+                    const component_t& incoming,
+                    const pixel_t& texel, int component, int tmu);
+
+    void    add(  component_t& dest,
+                    const component_t& incoming,
+                    const pixel_t& texel, int component);
+
+    // load/store stuff
+    void    store(const pointer_t& addr, const pixel_t& src, uint32_t flags=0);
+    void    load(const pointer_t& addr, const pixel_t& dest, uint32_t flags=0);
+    void    extract(integer_t& d, const pixel_t& s, int component);    
+    void    extract(component_t& d, const pixel_t& s, int component);    
+    void    extract(integer_t& d, int s, int h, int l, int bits=32);
+    void    expand(integer_t& d, const integer_t& s, int dbits);
+    void    expand(integer_t& d, const component_t& s, int dbits);
+    void    expand(component_t& d, const component_t& s, int dbits);
+    void    downshift(pixel_t& d, int component, component_t s, const reg_t& dither);
+
+
+    void    mul_factor( component_t& d,
+                        const integer_t& v,
+                        const integer_t& f);
+
+    void    mul_factor_add( component_t& d,
+                            const integer_t& v,
+                            const integer_t& f,
+                            const component_t& a);
+
+    void    component_add(  component_t& d,
+                            const integer_t& dst,
+                            const integer_t& src);
+
+    void    component_sat(  const component_t& v);
+
+
+    void    build_scanline_prolog(  fragment_parts_t& parts,
+                                    const needs_t& needs);
+
+    void    build_smooth_shade(const fragment_parts_t& parts);
+
+    void    build_component(    pixel_t& pixel,
+                                const fragment_parts_t& parts,
+                                int component,
+                                Scratch& global_scratches);
+                                
+    void    build_incoming_component(
+                                component_t& temp,
+                                int dst_size,
+                                const fragment_parts_t& parts,
+                                int component,
+                                Scratch& scratches,
+                                Scratch& global_scratches);
+
+    void    init_iterated_color(fragment_parts_t& parts, const reg_t& x);
+
+    void    build_iterated_color(   component_t& fragment,
+                                    const fragment_parts_t& parts,
+                                    int component,
+                                    Scratch& regs);
+
+    void    decodeLogicOpNeeds(const needs_t& needs);
+    
+    void    decodeTMUNeeds(const needs_t& needs, context_t const* c);
+
+    void    init_textures(  tex_coord_t* coords,
+                            const reg_t& x,
+                            const reg_t& y);
+
+    void    build_textures( fragment_parts_t& parts,
+                            Scratch& regs);
+
+    void    filter8(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    filter16(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    filter24(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    filter32(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    build_texture_environment(  component_t& fragment,
+                                        const fragment_parts_t& parts,
+                                        int component,
+                                        Scratch& regs);
+
+    void    wrapping(   int d,
+                        int coord, int size,
+                        int tx_wrap, int tx_linear);
+
+    void    build_fog(  component_t& temp,          // incoming fragment; receives the fogged result
+                        int component,              // nothing is emitted unless mInfo[component].fog is set
+                        Scratch& parent_scratches); // provides temp's new register when temp is not CORRUPTIBLE
+
+    void    build_blending(     component_t& in_out,        // incoming fragment; receives the blended result
+                                const pixel_t& pixel,       // framebuffer pixel the destination component is extracted from
+                                int component,              // nothing is emitted unless mInfo[component].blend is set
+                                Scratch& parent_scratches); // provides in_out's register when a new one is needed
+
+    void    build_blend_factor(                 // emits the code computing blend factor 'f' into 'factor'
+                integer_t& factor, int f, int component,
+                const pixel_t& dst_pixel,       // destination alpha is extracted from it for GGL_[ONE_MINUS_]DST_ALPHA
+                integer_t& fragment,            // loses its CORRUPTIBLE flag when its register is reused for the factor
+                integer_t& fb,                  // same as fragment; also the input for GGL_[ONE_MINUS_]DST_COLOR
+                Scratch& scratches);            // asked for a register when neither fragment nor fb is CORRUPTIBLE
+
+    void    build_blendFOneMinusF(  component_t& temp,
+                                    const integer_t& factor, 
+                                    const integer_t& fragment,
+                                    const integer_t& fb);
+
+    void    build_blendOneMinusFF(  component_t& temp,
+                                    const integer_t& factor, 
+                                    const integer_t& fragment,
+                                    const integer_t& fb);
+
+    void build_coverage_application(component_t& fragment,
+                                    const fragment_parts_t& parts,
+                                    Scratch& regs);
+
+    void build_alpha_test(component_t& fragment, const fragment_parts_t& parts);
+
+    enum { Z_TEST=1, Z_WRITE=2 }; 
+    void build_depth_test(const fragment_parts_t& parts, uint32_t mask);
+    void build_iterate_z(const fragment_parts_t& parts);
+    void build_iterate_f(const fragment_parts_t& parts);
+    void build_iterate_texture_coordinates(const fragment_parts_t& parts);
+
+    void build_logic_op(pixel_t& pixel, Scratch& regs);
+
+    void build_masking(pixel_t& pixel, Scratch& regs);
+
+    void build_and_immediate(int d, int s, uint32_t mask, int bits);
+
+    bool    isAlphaSourceNeeded() const;
+
+    enum {  // bit flags OR-ed together by blending_codes(); build_blending() extracts the framebuffer component when BLEND_DST or FACTOR_DST is set
+        FACTOR_SRC=1, FACTOR_DST=2, BLEND_SRC=4, BLEND_DST=8 
+    };
+    
+    enum {
+        LOGIC_OP=1, LOGIC_OP_SRC=2, LOGIC_OP_DST=4
+    };
+
+    static int blending_codes(int fs, int fd);
+
+    builder_context_t   mBuilderContext;
+    texture_machine_t   mTextureMachine;
+    component_info_t    mInfo[4];
+    int                 mBlending;
+    int                 mMasking;
+    int                 mAllMasked;
+    int                 mLogicOp;
+    int                 mAlphaTest;
+    int                 mAA;
+    int                 mDithering;
+    int                 mDepthTest;
+
+    int             mSmooth;
+    int             mFog;
+    pixel_t         mDstPixel;
+    
+    GGLFormat       mCbFormat;
+    
+    int             mBlendFactorCached;
+    integer_t       mAlphaSource;
+    
+    int             mBaseRegister;
+    
+    int             mBlendSrc;
+    int             mBlendDst;
+    int             mBlendSrcA;
+    int             mBlendDstA;
+    
+    int             mOptLevel;
+};
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
+#endif // ANDROID_GGLASSEMBLER_H
@@ -0,0 +1,300 @@
+/*	$NetBSD: armreg.h,v 1.28 2003/10/31 16:30:15 scw Exp $	*/
+
+/*-
+ * Copyright (c) 1998, 2001 Ben Harris
+ * Copyright (c) 1994-1996 Mark Brinicombe.
+ * Copyright (c) 1994 Brini.
+ * All rights reserved.
+ *
+ * This code is derived from software written for Brini by Mark Brinicombe
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Brini.
+ * 4. The name of the company nor the name of the author may be used to
+ *    endorse or promote products derived from this software without specific
+ *    prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: /repoman/r/ncvs/src/sys/arm/include/armreg.h,v 1.3 2005/11/21 19:06:25 cognet Exp $
+ */
+
+#ifndef MACHINE_ARMREG_H
+#define MACHINE_ARMREG_H
+#define INSN_SIZE	4		/* defined again, with the same value, in the "ARM Instructions" section at the end of this header */
+#define INSN_COND_MASK	0xf0000000	/* Condition mask (also defined again, identically, at the end of this header) */
+#define PSR_MODE        0x0000001f      /* mode mask */
+#define PSR_USR26_MODE  0x00000000
+#define PSR_FIQ26_MODE  0x00000001
+#define PSR_IRQ26_MODE  0x00000002
+#define PSR_SVC26_MODE  0x00000003
+#define PSR_USR32_MODE  0x00000010
+#define PSR_FIQ32_MODE  0x00000011
+#define PSR_IRQ32_MODE  0x00000012
+#define PSR_SVC32_MODE  0x00000013
+#define PSR_ABT32_MODE  0x00000017
+#define PSR_UND32_MODE  0x0000001b
+#define PSR_SYS32_MODE  0x0000001f
+#define PSR_32_MODE     0x00000010
+#define PSR_FLAGS	0xf0000000    /* flags */
+
+#define PSR_C_bit (1 << 29)       /* carry */
+
+/* The high-order byte is always the implementor */
+#define CPU_ID_IMPLEMENTOR_MASK	0xff000000
+#define CPU_ID_ARM_LTD		0x41000000 /* 'A' */
+#define CPU_ID_DEC		0x44000000 /* 'D' */
+#define CPU_ID_INTEL		0x69000000 /* 'i' */
+#define	CPU_ID_TI		0x54000000 /* 'T' */
+
+/* How to decide what format the CPUID is in: bits [15:12] are 0 for the
+ * old (pre-ARM7) format and 7 for the ARM7 format; any other value
+ * selects the new format. */
+#define CPU_ID_ISOLD(x)		(((x) & 0x0000f000) == 0x00000000)
+#define CPU_ID_IS7(x)		(((x) & 0x0000f000) == 0x00007000)
+#define CPU_ID_ISNEW(x)		(!CPU_ID_ISOLD(x) && !CPU_ID_IS7(x))
+
+/* On ARM3 and ARM6, this byte holds the foundry ID. */
+#define CPU_ID_FOUNDRY_MASK	0x00ff0000
+#define CPU_ID_FOUNDRY_VLSI	0x00560000
+
+/* On ARM7 it holds the architecture and variant (sub-model) */
+#define CPU_ID_7ARCH_MASK	0x00800000
+#define CPU_ID_7ARCH_V3		0x00000000
+#define CPU_ID_7ARCH_V4T	0x00800000
+#define CPU_ID_7VARIANT_MASK	0x007f0000
+
+/* On more recent ARMs, it does the same, but in a different format */
+#define CPU_ID_ARCH_MASK	0x000f0000
+#define CPU_ID_ARCH_V3		0x00000000
+#define CPU_ID_ARCH_V4		0x00010000
+#define CPU_ID_ARCH_V4T		0x00020000
+#define CPU_ID_ARCH_V5		0x00030000
+#define CPU_ID_ARCH_V5T		0x00040000
+#define CPU_ID_ARCH_V5TE	0x00050000
+#define CPU_ID_VARIANT_MASK	0x00f00000
+
+/* Next three nybbles are part number */
+#define CPU_ID_PARTNO_MASK	0x0000fff0
+
+/* Intel XScale has sub fields in part number */
+#define CPU_ID_XSCALE_COREGEN_MASK	0x0000e000 /* core generation */
+#define CPU_ID_XSCALE_COREREV_MASK	0x00001c00 /* core revision */
+#define CPU_ID_XSCALE_PRODUCT_MASK	0x000003f0 /* product number */
+
+/* And finally, the revision number. */
+#define CPU_ID_REVISION_MASK	0x0000000f
+
+/* Individual CPUs are probably best IDed by everything but the revision. */
+#define CPU_ID_CPU_MASK		0xfffffff0
+
+/* Fake CPU IDs for ARMs without CP15 */
+#define CPU_ID_ARM2		0x41560200
+#define CPU_ID_ARM250		0x41560250
+
+/* Pre-ARM7 CPUs -- [15:12] == 0 */
+#define CPU_ID_ARM3		0x41560300
+#define CPU_ID_ARM600		0x41560600
+#define CPU_ID_ARM610		0x41560610
+#define CPU_ID_ARM620		0x41560620
+
+/* ARM7 CPUs -- [15:12] == 7 */
+#define CPU_ID_ARM700		0x41007000 /* XXX This is a guess. */
+#define CPU_ID_ARM710		0x41007100
+#define CPU_ID_ARM7500		0x41027100 /* XXX This is a guess. */
+#define CPU_ID_ARM710A		0x41047100 /* inc ARM7100 */
+#define CPU_ID_ARM7500FE	0x41077100
+#define CPU_ID_ARM710T		0x41807100
+#define CPU_ID_ARM720T		0x41807200
+#define CPU_ID_ARM740T8K	0x41807400 /* XXX no MMU, 8KB cache */
+#define CPU_ID_ARM740T4K	0x41817400 /* XXX no MMU, 4KB cache */
+
+/* Post-ARM7 CPUs */
+#define CPU_ID_ARM810		0x41018100
+#define CPU_ID_ARM920T		0x41129200
+#define CPU_ID_ARM920T_ALT	0x41009200
+#define CPU_ID_ARM922T		0x41029220
+#define CPU_ID_ARM940T		0x41029400 /* XXX no MMU */
+#define CPU_ID_ARM946ES		0x41049460 /* XXX no MMU */
+#define	CPU_ID_ARM966ES		0x41049660 /* XXX no MMU */
+#define	CPU_ID_ARM966ESR1	0x41059660 /* XXX no MMU */
+#define CPU_ID_ARM1020E		0x4115a200 /* (AKA arm10 rev 1) */
+#define CPU_ID_ARM1022ES	0x4105a220
+#define CPU_ID_SA110		0x4401a100
+#define CPU_ID_SA1100		0x4401a110
+#define	CPU_ID_TI925T		0x54029250
+#define CPU_ID_SA1110		0x6901b110
+#define CPU_ID_IXP1200		0x6901c120
+#define CPU_ID_80200		0x69052000
+#define CPU_ID_PXA250    	0x69052100 /* sans core revision */
+#define CPU_ID_PXA210    	0x69052120
+#define CPU_ID_PXA250A		0x69052100 /* 1st version Core */
+#define CPU_ID_PXA210A		0x69052120 /* 1st version Core */
+#define CPU_ID_PXA250B		0x69052900 /* 3rd version Core */
+#define CPU_ID_PXA210B		0x69052920 /* 3rd version Core */
+#define CPU_ID_PXA250C		0x69052d00 /* 4th version Core */
+#define CPU_ID_PXA210C		0x69052d20 /* 4th version Core */
+#define	CPU_ID_80321_400	0x69052420
+#define	CPU_ID_80321_600	0x69052430
+#define	CPU_ID_80321_400_B0	0x69052c20
+#define	CPU_ID_80321_600_B0	0x69052c30
+#define	CPU_ID_IXP425_533	0x690541c0
+#define	CPU_ID_IXP425_400	0x690541d0
+#define	CPU_ID_IXP425_266	0x690541f0
+
+/* ARM3-specific coprocessor 15 registers */
+#define ARM3_CP15_FLUSH		1
+#define ARM3_CP15_CONTROL	2
+#define ARM3_CP15_CACHEABLE	3
+#define ARM3_CP15_UPDATEABLE	4
+#define ARM3_CP15_DISRUPTIVE	5	
+
+/* ARM3 Control register bits */
+#define ARM3_CTL_CACHE_ON	0x00000001
+#define ARM3_CTL_SHARED		0x00000002
+#define ARM3_CTL_MONITOR	0x00000004
+
+/*
+ * Post-ARM3 CP15 registers:
+ *
+ *	1	Control register
+ *
+ *	2	Translation Table Base
+ *
+ *	3	Domain Access Control
+ *
+ *	4	Reserved
+ *
+ *	5	Fault Status
+ *
+ *	6	Fault Address
+ *
+ *	7	Cache/write-buffer Control
+ *
+ *	8	TLB Control
+ *
+ *	9	Cache Lockdown
+ *
+ *	10	TLB Lockdown
+ *
+ *	11	Reserved
+ *
+ *	12	Reserved
+ *
+ *	13	Process ID (for FCSE)
+ *
+ *	14	Reserved
+ *
+ *	15	Implementation Dependent
+ */
+
+/* Some of the definitions below need cleaning up for V3/V4 architectures */
+
+/* CPU control register (CP15 register 1) */
+#define CPU_CONTROL_MMU_ENABLE	0x00000001 /* M: MMU/Protection unit enable */
+#define CPU_CONTROL_AFLT_ENABLE	0x00000002 /* A: Alignment fault enable */
+#define CPU_CONTROL_DC_ENABLE	0x00000004 /* C: IDC/DC enable */
+#define CPU_CONTROL_WBUF_ENABLE 0x00000008 /* W: Write buffer enable */
+#define CPU_CONTROL_32BP_ENABLE 0x00000010 /* P: 32-bit exception handlers */
+#define CPU_CONTROL_32BD_ENABLE 0x00000020 /* D: 32-bit addressing */
+#define CPU_CONTROL_LABT_ENABLE 0x00000040 /* L: Late abort enable */
+#define CPU_CONTROL_BEND_ENABLE 0x00000080 /* B: Big-endian mode */
+#define CPU_CONTROL_SYST_ENABLE 0x00000100 /* S: System protection bit */
+#define CPU_CONTROL_ROM_ENABLE	0x00000200 /* R: ROM protection bit */
+#define CPU_CONTROL_CPCLK	0x00000400 /* F: Implementation defined */
+#define CPU_CONTROL_BPRD_ENABLE 0x00000800 /* Z: Branch prediction enable */
+#define CPU_CONTROL_IC_ENABLE   0x00001000 /* I: IC enable */
+#define CPU_CONTROL_VECRELOC	0x00002000 /* V: Vector relocation */
+#define CPU_CONTROL_ROUNDROBIN	0x00004000 /* RR: Predictable replacement */
+#define CPU_CONTROL_V4COMPAT	0x00008000 /* L4: ARMv4 compat LDR R15 etc */
+
+#define CPU_CONTROL_IDC_ENABLE	CPU_CONTROL_DC_ENABLE
+
+/* XScale Auxiliary Control Register (CP15 register 1, opcode2 1) */
+#define	XSCALE_AUXCTL_K		0x00000001 /* dis. write buffer coalescing */
+#define	XSCALE_AUXCTL_P		0x00000002 /* ECC protect page table access */
+#define	XSCALE_AUXCTL_MD_WB_RA	0x00000000 /* mini-D$ wb, read-allocate */
+#define	XSCALE_AUXCTL_MD_WB_RWA	0x00000010 /* mini-D$ wb, read/write-allocate */
+#define	XSCALE_AUXCTL_MD_WT	0x00000020 /* mini-D$ wt, read-allocate */
+#define	XSCALE_AUXCTL_MD_MASK	0x00000030
+
+/* Cache type register definitions */
+#define	CPU_CT_ISIZE(x)		((x) & 0xfff)		/* I$ info */
+#define	CPU_CT_DSIZE(x)		(((x) >> 12) & 0xfff)	/* D$ info */
+#define	CPU_CT_S		(1U << 24)		/* split cache */
+#define	CPU_CT_CTYPE(x)		(((x) >> 25) & 0xf)	/* cache type */
+
+#define	CPU_CT_CTYPE_WT		0	/* write-through */
+#define	CPU_CT_CTYPE_WB1	1	/* write-back, clean w/ read */
+#define	CPU_CT_CTYPE_WB2	2	/* w/b, clean w/ cp15,7 */
+#define	CPU_CT_CTYPE_WB6	6	/* w/b, cp15,7, lockdown fmt A */
+#define	CPU_CT_CTYPE_WB7	7	/* w/b, cp15,7, lockdown fmt B */
+
+#define	CPU_CT_xSIZE_LEN(x)	((x) & 0x3)		/* line size */
+#define	CPU_CT_xSIZE_M		(1U << 2)		/* multiplier */
+#define	CPU_CT_xSIZE_ASSOC(x)	(((x) >> 3) & 0x7)	/* associativity */
+#define	CPU_CT_xSIZE_SIZE(x)	(((x) >> 6) & 0x7)	/* size */
+
+/* Fault status register definitions */
+
+#define FAULT_TYPE_MASK 0x0f
+#define FAULT_USER      0x10
+
+#define FAULT_WRTBUF_0  0x00 /* Vector Exception */
+#define FAULT_WRTBUF_1  0x02 /* Terminal Exception */
+#define FAULT_BUSERR_0  0x04 /* External Abort on Linefetch -- Section */
+#define FAULT_BUSERR_1  0x06 /* External Abort on Linefetch -- Page */
+#define FAULT_BUSERR_2  0x08 /* External Abort on Non-linefetch -- Section */
+#define FAULT_BUSERR_3  0x0a /* External Abort on Non-linefetch -- Page */
+#define FAULT_BUSTRNL1  0x0c /* External abort on Translation -- Level 1 */
+#define FAULT_BUSTRNL2  0x0e /* External abort on Translation -- Level 2 */
+#define FAULT_ALIGN_0   0x01 /* Alignment */
+#define FAULT_ALIGN_1   0x03 /* Alignment */
+#define FAULT_TRANS_S   0x05 /* Translation -- Section */
+#define FAULT_TRANS_P   0x07 /* Translation -- Page */
+#define FAULT_DOMAIN_S  0x09 /* Domain -- Section */
+#define FAULT_DOMAIN_P  0x0b /* Domain -- Page */
+#define FAULT_PERM_S    0x0d /* Permission -- Section */
+#define FAULT_PERM_P    0x0f /* Permission -- Page */
+
+#define	FAULT_IMPRECISE	0x400	/* Imprecise exception (XSCALE) */
+
+/*
+ * Address of the vector page, low and high versions.
+ */
+#define	ARM_VECTORS_LOW		0x00000000U
+#define	ARM_VECTORS_HIGH	0xffff0000U
+
+/*
+ * ARM Instructions
+ *
+ *       3 3 2 2 2                              
+ *       1 0 9 8 7                                                     0
+ *      +-------+-------------------------------------------------------+
+ *      | cond  |              instruction dependent                    |
+ *      |c c c c|                                                       |
+ *      +-------+-------------------------------------------------------+
+ */
+
+#define INSN_SIZE		4		/* Always 4 bytes */
+#define INSN_COND_MASK		0xf0000000	/* Condition mask */
+#define INSN_COND_AL		0xe0000000	/* Always condition */
+
+#endif /* !MACHINE_ARMREG_H */
@@ -0,0 +1,672 @@
+/* libs/pixelflinger/codeflinger/blending.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+
+#include <cutils/log.h>
+
+#include "codeflinger/GGLAssembler.h"
+
+
+namespace android {
+
+void GGLAssembler::build_fog(
+                        component_t& temp,      // incoming fragment / output
+                        int component,
+                        Scratch& regs)
+{
+    // fog is decided per component; emit nothing when it is off for this one
+    if (!mInfo[component].fog)
+        return;
+
+    // scoped register allocator
+    Scratch fog_scratch(registerFile());
+    comment("fog");
+
+    // describe the incoming value before 'temp' is possibly re-targeted
+    // to another register
+    integer_t incoming(temp.reg, temp.h, temp.flags);
+    const bool temp_is_corruptible = (temp.flags & CORRUPTIBLE) != 0;
+    if (!temp_is_corruptible) {
+        // the result must not overwrite the caller's register
+        temp.reg = regs.obtain();
+        temp.flags |= CORRUPTIBLE;
+    }
+
+    // fog color of this component: one byte read from the context
+    integer_t fog_color(fog_scratch.obtain(), 8, CORRUPTIBLE);
+    LDRB(AL, fog_color.reg, mBuilderContext.Rctx,
+            immed12_pre(GGL_OFFSETOF(state.fog.color[component])));
+
+    // fog factor, read from the context's generated variables
+    integer_t fog_factor(fog_scratch.obtain(), 16, CORRUPTIBLE);
+    CONTEXT_LOAD(fog_factor.reg, generated_vars.f);
+
+    // clamp the fog factor to [0, 0x10000]: negative values become 0,
+    // anything at or above 0x10000 becomes 0x10000
+    // (TODO: see if there is a way to guarantee we won't overflow,
+    // when setting the iterators)
+    BIC(AL, 0, fog_factor.reg, fog_factor.reg,
+            reg_imm(fog_factor.reg, ASR, 31));
+    CMP(AL, fog_factor.reg, imm( 0x10000 ));
+    MOV(HS, 0, fog_factor.reg, imm( 0x10000 ));
+
+    build_blendFOneMinusF(temp, fog_factor, incoming, fog_color);
+}
+
+void GGLAssembler::build_blending(   // emits the code blending one component of the fragment with the framebuffer
+                        component_t& temp,      // incoming fragment / output
+                        const pixel_t& pixel,   // framebuffer
+                        int component,
+                        Scratch& regs)
+{
+   if (!mInfo[component].blend)
+        return;
+        
+    int fs = component==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;  // the alpha component has its own pair of factors
+    int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
+    if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA)
+        fs = GGL_ONE;   // for the alpha component SRC_ALPHA_SATURATE is handled as ONE
+    const int blending = blending_codes(fs, fd);    // FACTOR_*/BLEND_* bit flags for this factor pair
+    if (!temp.size()) {
+        // here, blending will produce something which doesn't depend on
+        // that component (eg: GL_ZERO:GL_*), so the register has not been
+        // allocated yet. Will never be used as a source.
+        temp = component_t(regs.obtain(), CORRUPTIBLE);
+    }
+
+    // we are doing real blending...
+    // fb:          extracted dst
+    // fragment:    extracted src
+    // temp:        component_t(fragment) and result
+
+    // scoped register allocator
+    Scratch scratches(registerFile());
+    comment("blending");
+
+    // we can optimize these cases a bit...
+    // (1) saturation is not needed
+    // (2) we can use only one multiply instead of 2
+    // (3) we can reduce the register pressure
+    //      R = S*f + D*(1-f) = (S-D)*f + D
+    //      R = S*(1-f) + D*f = (D-S)*f + S
+
+    const bool same_factor_opt1 =   // source factor is f, destination factor is (1-f)
+        (fs==GGL_DST_COLOR && fd==GGL_ONE_MINUS_DST_COLOR) ||
+        (fs==GGL_SRC_COLOR && fd==GGL_ONE_MINUS_SRC_COLOR) ||
+        (fs==GGL_DST_ALPHA && fd==GGL_ONE_MINUS_DST_ALPHA) ||
+        (fs==GGL_SRC_ALPHA && fd==GGL_ONE_MINUS_SRC_ALPHA);
+
+    const bool same_factor_opt2 =   // source factor is (1-f), destination factor is f
+        (fs==GGL_ONE_MINUS_DST_COLOR && fd==GGL_DST_COLOR) ||
+        (fs==GGL_ONE_MINUS_SRC_COLOR && fd==GGL_SRC_COLOR) || 
+        (fs==GGL_ONE_MINUS_DST_ALPHA && fd==GGL_DST_ALPHA) ||
+        (fs==GGL_ONE_MINUS_SRC_ALPHA && fd==GGL_SRC_ALPHA);
+
+
+    // XXX: we could also optimize these cases:
+    // R = S*f + D*f = (S+D)*f
+    // R = S*(1-f) + D*(1-f) = (S+D)*(1-f)
+    // R = S*D + D*S = 2*S*D
+
+
+    // see if we need to extract 'component' from the destination (fb)
+    integer_t fb;
+    if (blending & (BLEND_DST|FACTOR_DST)) { 
+        fb.setTo(scratches.obtain(), 32); 
+        extract(fb, pixel, component);
+        if (mDithering) {
+            // XXX: maybe what we should do instead, is simply
+            // expand fb -or- fragment to the larger of the two
+            if (fb.size() < temp.size()) {
+                // for now we expand 'fb' to min(fragment, 8)
+                int new_size = temp.size() < 8 ? temp.size() : 8;
+                expand(fb, fb, new_size);
+            }
+        }
+    }
+
+
+    // convert input fragment to integer_t
+    if (temp.l && (temp.flags & CORRUPTIBLE)) {
+        MOV(AL, 0, temp.reg, reg_imm(temp.reg, LSR, temp.l));   // shift in place so the value starts at bit 0
+        temp.h -= temp.l;
+        temp.l = 0;
+    }
+    integer_t fragment(temp.reg, temp.size(), temp.flags);
+
+    // if not done yet, convert input fragment to integer_t
+    if (temp.l) {
+        // here we know temp is not CORRUPTIBLE
+        fragment.reg = scratches.obtain();
+        MOV(AL, 0, fragment.reg, reg_imm(temp.reg, LSR, temp.l));   // shifted copy; temp's register is left untouched
+        fragment.flags |= CORRUPTIBLE;
+    }
+
+    if (!(temp.flags & CORRUPTIBLE)) {
+        // temp is not corruptible, but since it's the destination it
+        // will be modified, so we need to allocate a new register.
+        temp.reg = regs.obtain();
+        temp.flags &= ~CORRUPTIBLE;     // no-op (the flag is clear in this branch); it is set at the end of the function
+        fragment.flags &= ~CORRUPTIBLE; // also clears the flag set just above for a shifted copy
+    }
+
+    if ((blending & BLEND_SRC) && !same_factor_opt1) {
+        // source (fragment) is needed for the blending stage
+        // so it's not CORRUPTIBLE (unless we're doing same_factor_opt1)
+        fragment.flags &= ~CORRUPTIBLE;
+    }
+
+
+    if (same_factor_opt1) {
+        //  R = S*f + D*(1-f) = (S-D)*f + D
+        integer_t factor;
+        build_blend_factor(factor, fs, 
+                component, pixel, fragment, fb, scratches);
+        // fb is always corruptible from this point
+        fb.flags |= CORRUPTIBLE;
+        build_blendFOneMinusF(temp, factor, fragment, fb);
+    } else if (same_factor_opt2) {
+        //  R = S*(1-f) + D*f = (D-S)*f + S
+        integer_t factor;
+        // fb is always corruptible here
+        fb.flags |= CORRUPTIBLE;
+        build_blend_factor(factor, fd,
+                component, pixel, fragment, fb, scratches);
+        build_blendOneMinusFF(temp, factor, fragment, fb);
+    } else {
+        integer_t src_factor;
+        integer_t dst_factor;
+
+        // if destination (fb) is not needed for the blending stage, 
+        // then it can be marked as CORRUPTIBLE
+        if (!(blending & BLEND_DST)) {
+            fb.flags |= CORRUPTIBLE;
+        }
+
+        // XXX: try to mark some registers as CORRUPTIBLE
+        // in most case we could make those corruptible
+        // when we're processing the last component
+        // but not always, for instance
+        //    when fragment is constant and not reloaded
+        //    when fb is needed for logic-ops or masking
+        //    when a register is aliased (for instance with mAlphaSource)
+
+        // blend away...
+        if (fs==GGL_ZERO) {
+            if (fd==GGL_ZERO) {         // R = 0
+                // already taken care of
+            } else if (fd==GGL_ONE) {   // R = D
+                // already taken care of
+            } else {                    // R = D*fd
+                // compute fd
+                build_blend_factor(dst_factor, fd,
+                        component, pixel, fragment, fb, scratches);
+                mul_factor(temp, fb, dst_factor);
+            }
+        } else if (fs==GGL_ONE) {
+            if (fd==GGL_ZERO) {         // R = S
+                // NOP, taken care of
+            } else if (fd==GGL_ONE) {   // R = S + D
+                component_add(temp, fb, fragment); // args order matters
+                component_sat(temp);
+            } else {                    // R = S + D*fd
+                // compute fd
+                build_blend_factor(dst_factor, fd,
+                        component, pixel, fragment, fb, scratches);
+                mul_factor_add(temp, fb, dst_factor, component_t(fragment));
+                component_sat(temp);
+            }
+        } else {
+            // compute fs
+            build_blend_factor(src_factor, fs, 
+                    component, pixel, fragment, fb, scratches);
+            if (fd==GGL_ZERO) {         // R = S*fs
+                mul_factor(temp, fragment, src_factor);
+            } else if (fd==GGL_ONE) {   // R = S*fs + D
+                mul_factor_add(temp, fragment, src_factor, component_t(fb));
+                component_sat(temp);
+            } else {                    // R = S*fs + D*fd
+                mul_factor(temp, fragment, src_factor);
+                if (scratches.isUsed(src_factor.reg))
+                    scratches.recycle(src_factor.reg);  // fs has been consumed; release its scratch register before computing fd
+                // compute fd
+                build_blend_factor(dst_factor, fd,
+                        component, pixel, fragment, fb, scratches);
+                mul_factor_add(temp, fb, dst_factor, temp);
+                if (!same_factor_opt1 && !same_factor_opt2) {   // always true here: both flags are false in this else-branch
+                    component_sat(temp);
+                }
+            }
+        }
+    }
+
+    // now we can be corrupted (it's the dest)
+    temp.flags |= CORRUPTIBLE;
+}
+
+void GGLAssembler::build_blend_factor(   // emits the code computing blend factor 'f' and describes its register in 'factor'
+        integer_t& factor, int f, int component,
+        const pixel_t& dst_pixel,   // destination alpha is extracted from it for GGL_[ONE_MINUS_]DST_ALPHA
+        integer_t& fragment,        // loses CORRUPTIBLE when its register is taken for the factor
+        integer_t& fb,              // loses CORRUPTIBLE when its register is taken for the factor
+        Scratch& scratches)         // asked for a register when neither fragment nor fb is CORRUPTIBLE
+{
+    integer_t src_alpha(fragment);  // source alpha defaults to the fragment; replaced by mAlphaSource below when that is used
+
+    // src_factor/dst_factor won't be used after blending,
+    // so it's fine to mark them as CORRUPTIBLE (if not aliased)
+    factor.flags |= CORRUPTIBLE;
+
+    switch(f) {     // step 1: choose the register that will hold the factor
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+        if (component==GGLFormat::ALPHA && !isAlphaSourceNeeded()) {
+            // we're processing alpha, so we already have
+            // src-alpha in fragment, and we need src-alpha just this time.
+        } else {
+           // alpha-src will be needed for other components
+            if (!mBlendFactorCached || mBlendFactorCached==f) {
+                src_alpha = mAlphaSource;
+                factor = mAlphaSource;
+                factor.flags &= ~CORRUPTIBLE;           // factor aliases mAlphaSource here, so it must not be handed out as scratch
+                // we already computed the blend factor before, nothing to do.
+                if (mBlendFactorCached)
+                    return;
+                // this is the first time, make sure to compute the blend
+                // factor properly.
+                mBlendFactorCached = f;
+                break;
+            } else {
+                // we have a cached alpha blend factor, but we want another one,
+                // this should really not happen because by construction,
+                // we cannot have BOTH source and destination
+                // blend factors use ALPHA *and* ONE_MINUS_ALPHA (because
+                // the blending stage uses the f/(1-f) optimization
+                
+                // for completeness, we handle this case though. Since there
+                // are only 2 choices, this means we want "the other one"
+                // (1-factor)
+                factor = mAlphaSource;
+                factor.flags &= ~CORRUPTIBLE;           
+                RSB(AL, 0, factor.reg, factor.reg, imm((1<<factor.s)));    // factor = (1<<s) - factor, in place in mAlphaSource's register
+                mBlendFactorCached = f;
+                return;
+            }                
+        }
+        // fall-through...
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+    case GGL_SRC_ALPHA_SATURATE:
+        // help us find out what register we can use for the blend-factor
+        // CORRUPTIBLE registers are chosen first, or a new one is allocated.
+        if (fragment.flags & CORRUPTIBLE) {
+            factor.setTo(fragment.reg, 32, CORRUPTIBLE);
+            fragment.flags &= ~CORRUPTIBLE;     // the register now belongs to the factor
+        } else if (fb.flags & CORRUPTIBLE) {
+            factor.setTo(fb.reg, 32, CORRUPTIBLE);
+            fb.flags &= ~CORRUPTIBLE;           // the register now belongs to the factor
+        } else {
+            factor.setTo(scratches.obtain(), 32, CORRUPTIBLE);
+        } 
+        break;
+    }
+
+    // XXX: doesn't work if size==1
+
+    switch(f) {     // step 2: factor = v + (v >> (size-1)), so an all-ones v yields exactly 1<<size
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+        factor.s = fb.s;
+        ADD(AL, 0, factor.reg, fb.reg, reg_imm(fb.reg, LSR, fb.s-1));
+        break;
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:
+        factor.s = fragment.s;
+        ADD(AL, 0, factor.reg, fragment.reg,
+            reg_imm(fragment.reg, LSR, fragment.s-1));
+        break;
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+        factor.s = src_alpha.s;
+        ADD(AL, 0, factor.reg, src_alpha.reg,
+                reg_imm(src_alpha.reg, LSR, src_alpha.s-1));
+        break;
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+        // XXX: should be precomputed
+        extract(factor, dst_pixel, GGLFormat::ALPHA);
+        ADD(AL, 0, factor.reg, factor.reg,
+                reg_imm(factor.reg, LSR, factor.s-1));
+        break;
+    case GGL_SRC_ALPHA_SATURATE:
+        // XXX: should be precomputed
+        // XXX: f = min(As, 1-Ad)
+        // btw, we're guaranteed that Ad's size is <= 8, because
+        // it's extracted from the framebuffer
+        break;      // nothing is emitted for this factor here
+    }
+
+    switch(f) {     // step 3: the ONE_MINUS_* variants complement the factor: (1<<size) - factor
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_ONE_MINUS_SRC_ALPHA:
+        RSB(AL, 0, factor.reg, factor.reg, imm((1<<factor.s)));
+    }
+    
+    // don't need more than 8-bits for the blend factor
+    // and this will prevent overflows in the multiplies later
+    if (factor.s > 8) {
+        MOV(AL, 0, factor.reg, reg_imm(factor.reg, LSR, factor.s-8));
+        factor.s = 8;   // NOTE(review): when factor aliases mAlphaSource only this copy's size changes, not mAlphaSource.s -- confirm mAlphaSource is never wider than 8 bits
+    }
+}
+
+// Translates a (source factor, destination factor) pair into the
+// BLEND_* / FACTOR_* bit-mask describing which operands the blend needs.
+// Factors not listed in a switch contribute no bits.
+int GGLAssembler::blending_codes(int fs, int fd)
+{
+    // Bits required by the source factor.
+    int srcBits = 0;
+    switch (fs) {
+    case GGL_ONE:
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+        // ALPHA-only factors need no per-component extraction from the
+        // destination, so only the source itself is flagged.
+        srcBits = BLEND_SRC;
+        break;
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+        srcBits = FACTOR_DST | BLEND_SRC;
+        break;
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+    case GGL_SRC_ALPHA_SATURATE:
+        srcBits = FACTOR_SRC | BLEND_SRC;
+        break;
+    default:
+        break;
+    }
+
+    // Bits required by the destination factor.
+    int dstBits = 0;
+    switch (fd) {
+    case GGL_ONE:
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+        // ALPHA-only factors need no per-component extraction from the
+        // source, so only the destination itself is flagged.
+        dstBits = BLEND_DST;
+        break;
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+        dstBits = FACTOR_DST | BLEND_DST;
+        break;
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:
+        dstBits = FACTOR_SRC | BLEND_DST;
+        break;
+    default:
+        break;
+    }
+
+    return srcBits | dstBits;
+}
+
+// ---------------------------------------------------------------------------
+
+// Emits R = S*f + D*(1-f), rewritten as (S-D)*f + D so that one
+// multiply-and-add is enough.
+//   temp     : receives the result
+//   factor   : f
+//   fragment : S
+//   fb       : D
+void GGLAssembler::build_blendFOneMinusF(
+        component_t& temp,
+        const integer_t& factor, 
+        const integer_t& fragment,
+        const integer_t& fb)
+{
+    Scratch scratches(registerFile());
+
+    // S-D goes into the fragment's register when it may be clobbered,
+    // into a scratch register otherwise.
+    const int diffReg = (fragment.flags & CORRUPTIBLE)
+            ? fragment.reg : scratches.obtain();
+    integer_t diff(diffReg, fb.size(), CORRUPTIBLE);
+
+    // Bring S to D's width as part of the subtraction.
+    const int extraBits = fragment.size() - fb.size();
+    if (extraBits == 0) {
+        RSB(AL, 0, diff.reg, fb.reg, fragment.reg);
+    } else if (extraBits > 0) {
+        RSB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSR, extraBits));
+    } else {
+        RSB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSL, -extraBits));
+    }
+
+    // temp = diff*factor + D
+    mul_factor_add(temp, diff, factor, component_t(fb));
+}
+
+// Emits R = (D-S)*f + S, i.e. D*f + S*(1-f), using one
+// multiply-and-add.
+//   temp     : receives the result
+//   factor   : f
+//   fragment : S
+//   fb       : D
+void GGLAssembler::build_blendOneMinusFF(
+        component_t& temp,
+        const integer_t& factor, 
+        const integer_t& fragment,
+        const integer_t& fb)
+{
+    Scratch scratches(registerFile());
+
+    // D-S goes into the framebuffer's register when it may be clobbered,
+    // into a scratch register otherwise.
+    const int diffReg = (fb.flags & CORRUPTIBLE)
+            ? fb.reg : scratches.obtain();
+    integer_t diff(diffReg, fb.size(), CORRUPTIBLE);
+
+    // Bring S to D's width as part of the subtraction.
+    const int extraBits = fragment.size() - fb.size();
+    if (extraBits == 0) {
+        SUB(AL, 0, diff.reg, fb.reg, fragment.reg);
+    } else if (extraBits > 0) {
+        SUB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSR, extraBits));
+    } else {
+        SUB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSL, -extraBits));
+    }
+
+    // temp = diff*factor + S
+    mul_factor_add(temp, diff, factor, component_t(fragment));
+}
+
+// ---------------------------------------------------------------------------
+
+// Emits the multiply d = v * f, choosing the 16-bit multiply variant that
+// suits the operand widths, and records the resulting bit-range in d.
+//   d : destination component; d.reg receives the product and doubles as
+//       the temporary when an operand has to be pre-shifted
+//   v : value to scale
+//   f : blend factor
+// On return d.h holds the width of the product; d.l is 0 when dithering,
+// otherwise f.size() with CLEAR_LO set in d.flags.
+void GGLAssembler::mul_factor(  component_t& d,
+                                const integer_t& v,
+                                const integer_t& f)
+{
+    int vs = v.size();
+    int fs = f.size();
+    int ms = vs+fs;
+
+    // XXX: we could have special cases for 1 bit mul
+
+    // all this code below to use the best multiply instruction
+    // wrt the parameters size. We take advantage of the fact
+    // that the 16-bits multiplies allow a 16-bit shift
+    // The trick is that we just make sure that we have at least 8-bits
+    // per component (which is enough for a 8 bits display).
+
+    int xy;
+    int vshift = 0;     // right-shift applied to v before the multiply
+    int fshift = 0;     // right-shift applied to f before the multiply
+    int smulw = 0;      // non-zero: use SMULW instead of SMUL
+
+    if (vs<16) {
+        if (fs<16) {
+            xy = xyBB;
+        } else if (GGL_BETWEEN(fs, 24, 31)) {
+            ms -= 16;
+            xy = xyTB;
+        } else {
+            // eg: 15 * 18  ->  15 * 15
+            fshift = fs - 15;
+            ms -= fshift;
+            xy = xyBB;
+        }
+    } else if (GGL_BETWEEN(vs, 24, 31)) {
+        if (fs<16) {
+            ms -= 16;
+            xy = xyTB;
+        } else if (GGL_BETWEEN(fs, 24, 31)) {
+            ms -= 32;
+            xy = xyTT;
+        } else {
+            // eg: 24 * 18  ->  8 * 18
+            fshift = fs - 15;
+            ms -= 16 + fshift;
+            xy = xyTB;
+        }
+    } else {
+        if (fs<16) {
+            // eg: 18 * 15  ->  15 * 15
+            vshift = vs - 15;
+            ms -= vshift;
+            xy = xyBB;
+        } else if (GGL_BETWEEN(fs, 24, 31)) {
+            // eg: 18 * 24  ->  15 * 8
+            vshift = vs - 15;
+            ms -= 16 + vshift;
+            xy = xyBT;
+        } else {
+            // eg: 18 * 18  ->  (15 * 18)>>16
+            fshift = fs - 15;
+            ms -= 16 + fshift;
+            xy = yB;    //XXX SMULWB
+            smulw = 1;
+        }
+    }
+
+    LOGE_IF(ms>=32, "mul_factor overflow vs=%d, fs=%d", vs, fs);
+
+    // At most one of vshift/fshift is set by the selection above, so d.reg
+    // can hold whichever operand needs to be narrowed.
+    int vreg = v.reg;
+    int freg = f.reg;
+    if (vshift) {
+        MOV(AL, 0, d.reg, reg_imm(vreg, LSR, vshift));
+        vreg = d.reg;
+    }
+    if (fshift) {
+        // narrow the factor itself (f), not the value
+        MOV(AL, 0, d.reg, reg_imm(freg, LSR, fshift));
+        freg = d.reg;
+    }
+    if (smulw)  SMULW(AL, xy, d.reg, vreg, freg);
+    else        SMUL(AL, xy, d.reg, vreg, freg);
+
+
+    d.h = ms;
+    if (mDithering) {
+        d.l = 0; 
+    } else {
+        d.l = fs; 
+        d.flags |= CLEAR_LO;
+    }
+}
+
+void GGLAssembler::mul_factor_add(  component_t& d,
+                                    const integer_t& v,
+                                    const integer_t& f,
+                                    const component_t& a)
+{   /* Emits d = v*f + a.  The addend is shifted so that its width (a.h)
+     * lines up with the product width (v.size() + f.size()); when both
+     * already agree a single multiply-accumulate is emitted.
+     * On return d.h is the product width and d.l is set as shown at the
+     * end of the function. */
+    // XXX: we could have special cases for 1 bit mul
+    Scratch scratches(registerFile());
+
+    int vs = v.size();
+    int fs = f.size();
+    int as = a.h;
+    int ms = vs+fs;
+
+    LOGE_IF(ms>=32, "mul_factor_add overflow vs=%d, fs=%d, as=%d", vs, fs, as);
+
+    integer_t add(a.reg, a.h, a.flags);
+
+    // 'a' is a component_t but it is guaranteed to have
+    // its high bits set to 0. However in the dithering case,
+    // we can't get away with truncating the potentially bad bits
+    // so extraction is needed.
+
+   if ((mDithering) && (a.size() < ms)) {
+        // we need to expand a
+        if (!(a.flags & CORRUPTIBLE)) {
+            // ... but it's not corruptible, so we need to pick a
+            // temporary register.
+            // Try to uses the destination register first (it's likely
+            // to be usable, unless it aliases an input).
+            if (d.reg!=a.reg && d.reg!=v.reg && d.reg!=f.reg) {
+                add.reg = d.reg;
+            } else {
+                add.reg = scratches.obtain();
+            }
+        }
+        expand(add, a, ms); // extracts and expands
+        as = ms;    /* the addend now has the product's width */
+    }
+
+    if (ms == as) { /* widths agree: one multiply-accumulate is enough */
+        if (vs<16 && fs<16) SMLABB(AL, d.reg, v.reg, f.reg, add.reg);
+        else                MLA(AL, 0, d.reg, v.reg, f.reg, add.reg);
+    } else {    /* widths differ: multiply first, then add the shifted addend */
+        int temp = d.reg;
+        if (temp == add.reg) {
+            // the mul will modify add.reg, we need an intermediary reg
+            if (v.flags & CORRUPTIBLE)      temp = v.reg;
+            else if (f.flags & CORRUPTIBLE) temp = f.reg;
+            else                            temp = scratches.obtain();
+        }
+
+        if (vs<16 && fs<16) SMULBB(AL, temp, v.reg, f.reg);
+        else                MUL(AL, 0, temp, v.reg, f.reg);
+
+        if (ms>as) {
+            ADD(AL, 0, d.reg, temp, reg_imm(add.reg, LSL, ms-as));
+        } else if (ms<as) {
+            // not sure if we should expand the mul instead?
+            ADD(AL, 0, d.reg, temp, reg_imm(add.reg, LSR, as-ms));
+        }
+    }
+
+    d.h = ms;
+    if (mDithering) {
+        d.l = a.l; 
+    } else {
+        d.l = fs>a.l ? fs : a.l;    /* the larger of fs and a.l */
+        d.flags |= CLEAR_LO;
+    }
+}
+
+// Emits d = src + dst, with dst shifted left so that it lines up with
+// src's width.
+// NOTE(review): the original comment states that callers guarantee
+// fragment.size() >= fb.size(); presumably that means
+// src.size() >= dst.size() here -- confirm against the call sites.
+void GGLAssembler::component_add(component_t& d,
+        const integer_t& dst, const integer_t& src)
+{
+    const int widen = src.size() - dst.size();
+    if (widen == 0) {
+        ADD(AL, 0, d.reg, src.reg, dst.reg);
+    } else {
+        ADD(AL, 0, d.reg, src.reg, reg_imm(dst.reg, LSL, widen));
+    }
+
+    // Record the bit-range of the sum.
+    d.h = src.size();
+    d.l = mDithering ? 0 : widen;
+    if (!mDithering) {
+        d.flags |= CLEAR_LO;
+    }
+}
+
+// Emits code that saturates v: the register is compared against 1<<v.h
+// and, under the HS condition, overwritten with the largest value of the
+// component, ((1<<v.size())-1)<<v.l.
+void GGLAssembler::component_sat(const component_t& v)
+{
+    const int limit = 1<<v.h;
+    const int maxValue = ((1<<v.size())-1)<<v.l;
+
+    CMP(AL, v.reg, imm( limit ));
+
+    if (isValidImmediate(maxValue)) {
+        MOV(HS, 0, v.reg, imm( maxValue ));
+        return;
+    }
+    if (isValidImmediate(~maxValue)) {
+        MVN(HS, 0, v.reg, imm( ~maxValue ));
+        return;
+    }
+
+    // Neither immediate form passed the check: build (1<<h) - (1<<l)
+    // in two conditional steps.
+    MOV(HS, 0, v.reg, imm( limit ));
+    SUB(HS, 0, v.reg, v.reg, imm( 1<<v.l ));
+}
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
@@ -0,0 +1,708 @@
+/*	$NetBSD: disassem.c,v 1.14 2003/03/27 16:58:36 mycroft Exp $	*/
+
+/*-
+ * Copyright (c) 1996 Mark Brinicombe.
+ * Copyright (c) 1996 Brini.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Brini.
+ * 4. The name of the company nor the name of the author may be used to
+ *    endorse or promote products derived from this software without specific
+ *    prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * RiscBSD kernel project
+ *
+ * db_disasm.c
+ *
+ * Kernel disassembler
+ *
+ * Created      : 10/02/96
+ *
+ * Structured after the sparc/sparc/db_disasm.c by David S. Miller &
+ * Paul Kranenburg
+ *
+ * This code is not complete. Not all instructions are disassembled.
+ */
+
+#include <sys/cdefs.h>
+//__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/arm/arm/disassem.c,v 1.2 2005/01/05 21:58:47 imp Exp $");
+#include <sys/param.h>
+#include <stdio.h>
+
+#include "disassem.h"
+#include "armreg.h"
+//#include <ddb/ddb.h>
+
+/*
+ * General instruction format
+ *
+ *	insn[cc][mod]	[operands]
+ *
+ * Those fields with an uppercase format code indicate that the field
+ * follows directly after the instruction before the separator i.e.
+ * they modify the instruction rather than just being an operand to
+ * the instruction. The only exception is the writeback flag which
+ * follows an operand.
+ *
+ *
+ * 2 - print Operand 2 of a data processing instruction
+ * d - destination register (bits 12-15)
+ * n - n register (bits 16-19)
+ * s - s register (bits 8-11)
+ * o - indirect register rn (bits 16-19) (used by swap)
+ * m - m register (bits 0-3)
+ * a - address operand of ldr/str instruction
+ * e - address operand of ldrh/strh instruction
+ * l - register list for ldm/stm instruction
+ * f - 1st fp operand (register) (bits 12-14)
+ * g - 2nd fp operand (register) (bits 16-18)
+ * h - 3rd fp operand (register/immediate) (bits 0-4)
+ * j - xtb rotate literal (bits 10-11)
+ * b - branch address
+ * t - thumb branch address (bits 24, 0-23)
+ * k - breakpoint comment (bits 0-3, 8-19)
+ * X - block transfer type
+ * Y - block transfer type (r13 base)
+ * c - comment field bits(0-23)
+ * p - saved or current status register
+ * F - PSR transfer fields
+ * D - destination-is-r15 (P) flag on TST, TEQ, CMP, CMN
+ * L - co-processor transfer size
+ * S - set status flag
+ * P - fp precision
+ * Q - fp precision (for ldf/stf)
+ * R - fp rounding
+ * v - co-processor data transfer registers + addressing mode
+ * W - writeback flag
+ * x - instruction in hex
+ * # - co-processor number
+ * y - co-processor data processing registers
+ * z - co-processor register transfer registers
+ */
+
+struct arm32_insn {	/* one row of the instruction decode table */
+	u_int mask;	/* bits of the instruction word that take part in the match */
+	u_int pattern;	/* value the masked instruction word must equal */
+	char* name;	/* mnemonic printed for a match; NULL ends the table */
+	char* format;	/* format codes (see the legend above) driving operand output */
+};
+
+/*
+ * Instruction decode table.  disasm() walks it from the top and stops at
+ * the first entry for which (insn & mask) == pattern, so more specific
+ * encodings must be listed before the general ones they overlap with.
+ * The table is terminated by the entry whose name is NULL.
+ */
+static const struct arm32_insn arm32_i[] = {
+    { 0x0fffffff, 0x0ff00000, "imb",	"c" },		/* Before swi */
+    { 0x0fffffff, 0x0ff00001, "imbrange",	"c" },	/* Before swi */
+    { 0x0f000000, 0x0f000000, "swi",	"c" },
+    { 0xfe000000, 0xfa000000, "blx",	"t" },		/* Before b and bl */
+    { 0x0f000000, 0x0a000000, "b",	"b" },
+    { 0x0f000000, 0x0b000000, "bl",	"b" },
+    { 0x0fe000f0, 0x00000090, "mul",	"Snms" },
+    { 0x0fe000f0, 0x00200090, "mla",	"Snmsd" },
+    { 0x0fe000f0, 0x00800090, "umull",	"Sdnms" },
+    { 0x0fe000f0, 0x00c00090, "smull",	"Sdnms" },
+    { 0x0fe000f0, 0x00a00090, "umlal",	"Sdnms" },
+    { 0x0fe000f0, 0x00e00090, "smlal",	"Sdnms" },
+    { 0x0fff03f0, 0x06cf0070, "uxtb16", "dmj" },
+    { 0x0d700000, 0x04200000, "strt",	"daW" },
+    { 0x0d700000, 0x04300000, "ldrt",	"daW" },
+    { 0x0d700000, 0x04600000, "strbt",	"daW" },
+    { 0x0d700000, 0x04700000, "ldrbt",	"daW" },
+    { 0x0c500000, 0x04000000, "str",	"daW" },
+    { 0x0c500000, 0x04100000, "ldr",	"daW" },
+    { 0x0c500000, 0x04400000, "strb",	"daW" },
+    { 0x0c500000, 0x04500000, "ldrb",	"daW" },
+    { 0x0e1f0000, 0x080d0000, "stm",	"YnWl" },/* separate out r13 base */
+    { 0x0e1f0000, 0x081d0000, "ldm",	"YnWl" },/* separate out r13 base */
+    { 0x0e100000, 0x08000000, "stm",	"XnWl" },
+    { 0x0e100000, 0x08100000, "ldm",	"XnWl" },
+    { 0x0e1000f0, 0x00100090, "ldrb",	"deW" },
+    { 0x0e1000f0, 0x00000090, "strb",	"deW" },
+    { 0x0e1000f0, 0x001000d0, "ldrsb",	"deW" },
+    { 0x0e1000f0, 0x001000b0, "ldrh",	"deW" },
+    { 0x0e1000f0, 0x000000b0, "strh",	"deW" },
+    { 0x0e1000f0, 0x001000f0, "ldrsh",	"deW" },
+    { 0x0f200090, 0x00200090, "und",	"x" },	/* Before data processing */
+    { 0x0e1000d0, 0x000000d0, "und",	"x" },	/* Before data processing */
+    { 0x0ff00ff0, 0x01000090, "swp",	"dmo" },
+    { 0x0ff00ff0, 0x01400090, "swpb",	"dmo" },
+    { 0x0fbf0fff, 0x010f0000, "mrs",	"dp" },	/* Before data processing */
+    { 0x0fb0fff0, 0x0120f000, "msr",	"pFm" },/* Before data processing */
+    { 0x0fb0f000, 0x0320f000, "msr",	"pF2" },/* Before data processing */
+    { 0x0ffffff0, 0x012fff10, "bx",     "m" },
+    { 0x0fff0ff0, 0x016f0f10, "clz",	"dm" },
+    { 0x0ffffff0, 0x012fff30, "blx",	"m" },
+    { 0xfff000f0, 0xe1200070, "bkpt",	"k" },
+    { 0x0de00000, 0x00000000, "and",	"Sdn2" },
+    { 0x0de00000, 0x00200000, "eor",	"Sdn2" },
+    { 0x0de00000, 0x00400000, "sub",	"Sdn2" },
+    { 0x0de00000, 0x00600000, "rsb",	"Sdn2" },
+    { 0x0de00000, 0x00800000, "add",	"Sdn2" },
+    { 0x0de00000, 0x00a00000, "adc",	"Sdn2" },
+    { 0x0de00000, 0x00c00000, "sbc",	"Sdn2" },
+    { 0x0de00000, 0x00e00000, "rsc",	"Sdn2" },
+    { 0x0df00000, 0x01100000, "tst",	"Dn2" },
+    { 0x0df00000, 0x01300000, "teq",	"Dn2" },
+    { 0x0df00000, 0x01500000, "cmp",	"Dn2" },
+    { 0x0df00000, 0x01700000, "cmn",	"Dn2" },
+    { 0x0de00000, 0x01800000, "orr",	"Sdn2" },
+    { 0x0de00000, 0x01a00000, "mov",	"Sd2" },
+    { 0x0de00000, 0x01c00000, "bic",	"Sdn2" },
+    { 0x0de00000, 0x01e00000, "mvn",	"Sd2" },
+    { 0x0ff08f10, 0x0e000100, "adf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e100100, "muf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e200100, "suf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e300100, "rsf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e400100, "dvf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e500100, "rdf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e600100, "pow",	"PRfgh" },
+    { 0x0ff08f10, 0x0e700100, "rpw",	"PRfgh" },
+    { 0x0ff08f10, 0x0e800100, "rmf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e900100, "fml",	"PRfgh" },
+    { 0x0ff08f10, 0x0ea00100, "fdv",	"PRfgh" },
+    { 0x0ff08f10, 0x0eb00100, "frd",	"PRfgh" },
+    { 0x0ff08f10, 0x0ec00100, "pol",	"PRfgh" },
+    { 0x0f008f10, 0x0e000100, "fpbop",	"PRfgh" },
+    { 0x0ff08f10, 0x0e008100, "mvf",	"PRfh" },
+    { 0x0ff08f10, 0x0e108100, "mnf",	"PRfh" },
+    { 0x0ff08f10, 0x0e208100, "abs",	"PRfh" },
+    { 0x0ff08f10, 0x0e308100, "rnd",	"PRfh" },
+    { 0x0ff08f10, 0x0e408100, "sqt",	"PRfh" },
+    { 0x0ff08f10, 0x0e508100, "log",	"PRfh" },
+    { 0x0ff08f10, 0x0e608100, "lgn",	"PRfh" },
+    { 0x0ff08f10, 0x0e708100, "exp",	"PRfh" },
+    { 0x0ff08f10, 0x0e808100, "sin",	"PRfh" },
+    { 0x0ff08f10, 0x0e908100, "cos",	"PRfh" },
+    { 0x0ff08f10, 0x0ea08100, "tan",	"PRfh" },
+    { 0x0ff08f10, 0x0eb08100, "asn",	"PRfh" },
+    { 0x0ff08f10, 0x0ec08100, "acs",	"PRfh" },
+    { 0x0ff08f10, 0x0ed08100, "atn",	"PRfh" },
+    { 0x0f008f10, 0x0e008100, "fpuop",	"PRfh" },
+    { 0x0e100f00, 0x0c000100, "stf",	"QLv" },
+    { 0x0e100f00, 0x0c100100, "ldf",	"QLv" },
+    { 0x0ff00f10, 0x0e000110, "flt",	"PRgd" },
+    { 0x0ff00f10, 0x0e100110, "fix",	"PRdh" },
+    { 0x0ff00f10, 0x0e200110, "wfs",	"d" },
+    { 0x0ff00f10, 0x0e300110, "rfs",	"d" },
+    { 0x0ff00f10, 0x0e400110, "wfc",	"d" },
+    { 0x0ff00f10, 0x0e500110, "rfc",	"d" },
+    { 0x0ff0ff10, 0x0e90f110, "cmf",	"PRgh" },
+    { 0x0ff0ff10, 0x0eb0f110, "cnf",	"PRgh" },
+    { 0x0ff0ff10, 0x0ed0f110, "cmfe",	"PRgh" },
+    { 0x0ff0ff10, 0x0ef0f110, "cnfe",	"PRgh" },
+    { 0xff100010, 0xfe000010, "mcr2",	"#z" },
+    { 0x0f100010, 0x0e000010, "mcr",	"#z" },
+    { 0xff100010, 0xfe100010, "mrc2",	"#z" },
+    { 0x0f100010, 0x0e100010, "mrc",	"#z" },
+    { 0xff000010, 0xfe000000, "cdp2",	"#y" },
+    { 0x0f000010, 0x0e000000, "cdp",	"#y" },
+    { 0xfe100090, 0xfc100000, "ldc2",	"L#v" },
+    { 0x0e100090, 0x0c100000, "ldc",	"L#v" },
+    { 0xfe100090, 0xfc000000, "stc2",	"L#v" },
+    { 0x0e100090, 0x0c000000, "stc",	"L#v" },
+    { 0xf550f000, 0xf550f000, "pld",	"ne" },
+    /* saturating add/subtract: qadd, qdadd, qdsub, qsub */
+    { 0x0ff00ff0, 0x01000050, "qadd",	"dmn" },
+    { 0x0ff00ff0, 0x01400050, "qdadd",	"dmn" },
+    { 0x0ff00ff0, 0x01600050, "qdsub",	"dmn" },
+    { 0x0ff00ff0, 0x01200050, "qsub",	"dmn" },
+    { 0x0ff000f0, 0x01000080, "smlabb",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x010000a0, "smlatb",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x010000c0, "smlabt",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x010000e0, "smlatt",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x01400080, "smlalbb","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x014000a0, "smlaltb","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x014000c0, "smlalbt","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x014000e0, "smlaltt","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x01200080, "smlawb", "nmsd" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x012000a0, "smulwb","nms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x012000c0, "smlawt", "nmsd" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x012000e0, "smulwt","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x01600080, "smulbb","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x016000a0, "smultb","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x016000c0, "smulbt","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x016000e0, "smultt","nms" },   // d & n inverted!!
+    { 0x00000000, 0x00000000, NULL,	NULL }
+};
+
+static char const arm32_insn_conditions[][4] = {	/* condition suffixes; insn_condition() indexes by bits 28-31 */
+	"eq", "ne", "cs", "cc",
+	"mi", "pl", "vs", "vc",
+	"hi", "ls", "ge", "lt",
+	"gt", "le", "",   "nv"
+};
+
+static char const insn_block_transfers[][4] = {	/* ldm/stm suffixes; insn_blktrans() indexes by bits 23-24 */
+	"da", "ia", "db", "ib"
+};
+
+static char const insn_stack_block_transfers[][4] = {	/* ldm/stm stack-style suffixes; index computed by insn_stkblktrans() */
+	"ed", "ea", "fd", "fa"
+};
+
+static char const op_shifts[][4] = {	/* shift mnemonics; op2_shift() indexes by bits 5-6 */
+	"lsl", "lsr", "asr", "ror"
+};
+
+static char const insn_fpa_rounding[][2] = {	/* fp rounding suffixes; insn_fparnd() indexes by bits 5-6 */
+	"", "p", "m", "z"
+};
+
+static char const insn_fpa_precision[][2] = {	/* fp precision suffixes; used by insn_fpaprec()/insn_fpaprect() */
+	"s", "d", "e", "p"
+};
+
+static char const insn_fpaconstants[][8] = {	/* fp immediate constants; insn_fpaimm() indexes by bits 0-2 */
+	"0.0", "1.0", "2.0", "3.0",
+	"4.0", "5.0", "0.5", "10.0"
+};
+
+/*
+ * Field decoders: each one picks the string-table entry selected by the
+ * relevant bits of the instruction word x.  The two precision decoders
+ * combine two separate bits into a 2-bit index (high bit from bit 19 or
+ * 22, low bit from bit 7 or 15), so each bit is masked on its own.
+ */
+#define insn_condition(x)	arm32_insn_conditions[((x) >> 28) & 0x0f]
+#define insn_blktrans(x)	insn_block_transfers[((x) >> 23) & 3]
+#define insn_stkblktrans(x)	insn_stack_block_transfers[(3*(((x) >> 20)&1))^(((x) >> 23)&3)]
+#define op2_shift(x)		op_shifts[((x) >> 5) & 3]
+#define insn_fparnd(x)		insn_fpa_rounding[((x) >> 5) & 0x03]
+#define insn_fpaprec(x)		insn_fpa_precision[(((x) >> 18) & 2) | (((x) >> 7) & 1)]
+#define insn_fpaprect(x)	insn_fpa_precision[(((x) >> 21) & 2) | (((x) >> 15) & 1)]
+#define insn_fpaimm(x)		insn_fpaconstants[(x) & 0x07]
+
+/* Local prototypes: operand printers called from disasm(), then the two callbacks placed in disassemble_di */
+static void disasm_register_shift(const disasm_interface_t *di, u_int insn);
+static void disasm_print_reglist(const disasm_interface_t *di, u_int insn);
+static void disasm_insn_ldrstr(const disasm_interface_t *di, u_int insn,
+    u_int loc);
+static void disasm_insn_ldrhstrh(const disasm_interface_t *di, u_int insn,
+    u_int loc);
+static void disasm_insn_ldcstc(const disasm_interface_t *di, u_int insn,
+    u_int loc);
+static u_int disassemble_readword(u_int address);
+static void disassemble_printaddr(u_int address);
+
+/*
+ * disasm --
+ *	Decode and print the single ARM instruction found at 'loc'.
+ *
+ *	di	callbacks used to fetch the instruction word (di_readword),
+ *		to print text (di_printf) and to print addresses
+ *		(di_printaddr)
+ *	loc	address of the instruction
+ *	altfmt	not used by this implementation
+ *
+ *	The first arm32_i[] entry whose mask/pattern matches selects the
+ *	mnemonic and the format string; every format character then prints
+ *	one modifier (uppercase codes, glued to the mnemonic) or one operand
+ *	(other codes, separated by a tab and then by ", ").
+ *
+ *	Returns loc + INSN_SIZE.
+ */
+u_int
+disasm(const disasm_interface_t *di, u_int loc, int altfmt)
+{
+	const struct arm32_insn *i_ptr = &arm32_i[0];
+
+	u_int insn;
+	int matchp;
+	int branch;
+	const char *f_ptr;
+	int fmt;
+
+	fmt = 0;
+	matchp = 0;
+	insn = di->di_readword(loc);
+
+/*	di->di_printf("loc=%08x insn=%08x : ", loc, insn);*/
+
+	/* First match wins; the table is ordered accordingly. */
+	while (i_ptr->name) {
+		if ((insn & i_ptr->mask) ==  i_ptr->pattern) {
+			matchp = 1;
+			break;
+		}
+		i_ptr++;
+	}
+
+	if (!matchp) {
+		di->di_printf("und%s\t%08x\n", insn_condition(insn), insn);
+		return(loc + INSN_SIZE);
+	}
+
+	/* If instruction forces condition code, don't print it. */
+	if ((i_ptr->mask & 0xf0000000) == 0xf0000000)
+		di->di_printf("%s", i_ptr->name);
+	else
+		di->di_printf("%s%s", i_ptr->name, insn_condition(insn));
+
+	f_ptr = i_ptr->format;
+
+	/* Insert tab if there are no instruction modifiers */
+
+	if (*(f_ptr) < 'A' || *(f_ptr) > 'Z') {
+		++fmt;
+		di->di_printf("\t");
+	}
+
+	while (*f_ptr) {
+		switch (*f_ptr) {
+		/* 2 - print Operand 2 of a data processing instruction */
+		case '2':
+			if (insn & 0x02000000) {
+				u_int imm8 = insn & 0xff;
+				int rotate = ((insn >> 7) & 0x1e);
+
+				/*
+				 * Rotate the 8-bit immediate right.  A zero
+				 * rotation is handled apart: it would need a
+				 * left shift by 32, which is undefined for a
+				 * 32-bit operand.
+				 */
+				if (rotate != 0)
+					imm8 = (imm8 << (32 - rotate)) |
+					    (imm8 >> rotate);
+				di->di_printf("#0x%08x", imm8);
+			} else {
+				disasm_register_shift(di, insn);
+			}
+			break;
+		/* d - destination register (bits 12-15) */
+		case 'd':
+			di->di_printf("r%d", ((insn >> 12) & 0x0f));
+			break;
+		/* D - insert 'p' if Rd is R15 */
+		case 'D':
+			if (((insn >> 12) & 0x0f) == 15)
+				di->di_printf("p");
+			break;
+		/* n - n register (bits 16-19) */
+		case 'n':
+			di->di_printf("r%d", ((insn >> 16) & 0x0f));
+			break;
+		/* s - s register (bits 8-11) */
+		case 's':
+			di->di_printf("r%d", ((insn >> 8) & 0x0f));
+			break;
+		/* o - indirect register rn (bits 16-19) (used by swap) */
+		case 'o':
+			di->di_printf("[r%d]", ((insn >> 16) & 0x0f));
+			break;
+		/* m - m register (bits 0-3) */
+		case 'm':
+			di->di_printf("r%d", ((insn >> 0) & 0x0f));
+			break;
+		/* a - address operand of ldr/str instruction */
+		case 'a':
+			disasm_insn_ldrstr(di, insn, loc);
+			break;
+		/* e - address operand of ldrh/strh instruction */
+		case 'e':
+			disasm_insn_ldrhstrh(di, insn, loc);
+			break;
+		/* l - register list for ldm/stm instruction */
+		case 'l':
+			disasm_print_reglist(di, insn);
+			break;
+		/* f - 1st fp operand (register) (bits 12-14) */
+		case 'f':
+			di->di_printf("f%d", (insn >> 12) & 7);
+			break;
+		/* g - 2nd fp operand (register) (bits 16-18) */
+		case 'g':
+			di->di_printf("f%d", (insn >> 16) & 7);
+			break;
+		/* h - 3rd fp operand (register/immediate) (bits 0-4) */
+		case 'h':
+			if (insn & (1 << 3))
+				di->di_printf("#%s", insn_fpaimm(insn));
+			else
+				di->di_printf("f%d", insn & 7);
+			break;
+		/* j - xtb rotate literal (bits 10-11) */
+		case 'j':
+			di->di_printf("ror #%d", ((insn >> 10) & 3) << 3);
+			break;
+		/* b - branch address */
+		case 'b':
+			/* word offset in bits 0-23, sign-extended from 26 bits */
+			branch = ((insn << 2) & 0x03ffffff);
+			if (branch & 0x02000000)
+				branch |= 0xfc000000;
+			di->di_printaddr(loc + 8 + branch);
+			break;
+		/* t - blx address */
+		case 't':
+			/* as for 'b', with bit 24 supplying bit 1 of the offset */
+			branch = ((insn << 2) & 0x03ffffff) |
+			    (insn >> 23 & 0x00000002);
+			if (branch & 0x02000000)
+				branch |= 0xfc000000;
+			di->di_printaddr(loc + 8 + branch);
+			break;
+		/* X - block transfer type */
+		case 'X':
+			di->di_printf("%s", insn_blktrans(insn));
+			break;
+		/* Y - block transfer type (r13 base) */
+		case 'Y':
+			di->di_printf("%s", insn_stkblktrans(insn));
+			break;
+		/* c - comment field bits(0-23) */
+		case 'c':
+			di->di_printf("0x%08x", (insn & 0x00ffffff));
+			break;
+		/* k - breakpoint comment (bits 0-3, 8-19) */
+		case 'k':
+			di->di_printf("0x%04x",
+			    (insn & 0x000fff00) >> 4 | (insn & 0x0000000f));
+			break;
+		/* p - saved or current status register */
+		case 'p':
+			if (insn & 0x00400000)
+				di->di_printf("spsr");
+			else
+				di->di_printf("cpsr");
+			break;
+		/* F - PSR transfer fields */
+		case 'F':
+			di->di_printf("_");
+			if (insn & (1 << 16))
+				di->di_printf("c");
+			if (insn & (1 << 17))
+				di->di_printf("x");
+			if (insn & (1 << 18))
+				di->di_printf("s");
+			if (insn & (1 << 19))
+				di->di_printf("f");
+			break;
+		/* B - byte transfer flag */
+		case 'B':
+			if (insn & 0x00400000)
+				di->di_printf("b");
+			break;
+		/* L - co-processor transfer size */
+		case 'L':
+			if (insn & (1 << 22))
+				di->di_printf("l");
+			break;
+		/* S - set status flag */
+		case 'S':
+			if (insn & 0x00100000)
+				di->di_printf("s");
+			break;
+		/* P - fp precision */
+		case 'P':
+			di->di_printf("%s", insn_fpaprec(insn));
+			break;
+		/* Q - fp precision (for ldf/stf) */
+		case 'Q':
+			break;
+		/* R - fp rounding */
+		case 'R':
+			di->di_printf("%s", insn_fparnd(insn));
+			break;
+		/* W - writeback flag */
+		case 'W':
+			if (insn & (1 << 21))
+				di->di_printf("!");
+			break;
+		/* # - co-processor number */
+		case '#':
+			di->di_printf("p%d", (insn >> 8) & 0x0f);
+			break;
+		/* v - co-processor data transfer registers+addressing mode */
+		case 'v':
+			disasm_insn_ldcstc(di, insn, loc);
+			break;
+		/* x - instruction in hex */
+		case 'x':
+			di->di_printf("0x%08x", insn);
+			break;
+		/* y - co-processor data processing registers */
+		case 'y':
+			di->di_printf("%d, ", (insn >> 20) & 0x0f);
+
+			di->di_printf("c%d, c%d, c%d", (insn >> 12) & 0x0f,
+			    (insn >> 16) & 0x0f, insn & 0x0f);
+
+			di->di_printf(", %d", (insn >> 5) & 0x07);
+			break;
+		/* z - co-processor register transfer registers */
+		case 'z':
+			di->di_printf("%d, ", (insn >> 21) & 0x07);
+			di->di_printf("r%d, c%d, c%d, %d",
+			    (insn >> 12) & 0x0f, (insn >> 16) & 0x0f,
+			    insn & 0x0f, (insn >> 5) & 0x07);
+
+/*			if (((insn >> 5) & 0x07) != 0)
+				di->di_printf(", %d", (insn >> 5) & 0x07);*/
+			break;
+		default:
+			di->di_printf("[%c - unknown]", *f_ptr);
+			break;
+		}
+		/*
+		 * An uppercase code that follows is a modifier and is printed
+		 * without a separator; otherwise the first separator is a tab
+		 * and the later ones are ", ".
+		 */
+		if (*(f_ptr+1) >= 'A' && *(f_ptr+1) <= 'Z')
+			++f_ptr;
+		else if (*(++f_ptr)) {
+			++fmt;
+			if (fmt == 1)
+				di->di_printf("\t");
+			else
+				di->di_printf(", ");
+		}
+	}
+
+	di->di_printf("\n");
+
+	return(loc + INSN_SIZE);
+}
+
+
+/*
+ * Print a register operand (bits 0-3) followed, when present, by its
+ * shift: "rrx", a shift by register (bits 8-11) or a shift by immediate
+ * (bits 7-11).
+ */
+static void
+disasm_register_shift(const disasm_interface_t *di, u_int insn)
+{
+	const u_int shift_field = insn & 0x00000ff0;
+
+	di->di_printf("r%d", (insn & 0x0f));
+
+	/* No shift at all: the bare register is the whole operand. */
+	if (shift_field == 0)
+		return;
+
+	if (shift_field == 0x00000060) {
+		di->di_printf(", rrx");
+		return;
+	}
+
+	if (insn & 0x10) {
+		/* shift amount held in a register */
+		di->di_printf(", %s r%d", op2_shift(insn),
+		    (insn >> 8) & 0x0f);
+	} else {
+		/* shift amount is an immediate */
+		di->di_printf(", %s #%d", op2_shift(insn),
+		    (insn >> 7) & 0x1f);
+	}
+}
+
+
+/*
+ * Print the ldm/stm register list held in bits 0-15 as "{...}".
+ * Consecutive registers are collapsed into "rA-rB" ranges, and "^" is
+ * appended when bit 22 is set.
+ */
+static void
+disasm_print_reglist(const disasm_interface_t *di, u_int insn)
+{
+	int bit;
+	int first;
+	int printed;
+
+	di->di_printf("{");
+	printed = 0;
+	bit = 0;
+
+	while (bit < 16) {
+		if (!(insn & (1 << bit))) {
+			++bit;
+			continue;
+		}
+
+		/* Start of a run: move past its last set bit. */
+		first = bit;
+		while (bit < 16 && (insn & (1 << bit)))
+			++bit;
+
+		if (printed)
+			di->di_printf(", ");
+		printed = 1;
+
+		if (first == bit - 1)
+			di->di_printf("r%d", first);
+		else
+			di->di_printf("r%d-r%d", first, bit - 1);
+	}
+	di->di_printf("}");
+
+	if (insn & (1 << 22))
+		di->di_printf("^");
+}
+
+/*
+ * Print the address operand of an ldr/str instruction.  A pc-relative
+ * access with an immediate offset is printed as the resolved address
+ * through di_printaddr; anything else is printed as "[rN ...]".
+ */
+static void
+disasm_insn_ldrstr(const disasm_interface_t *di, u_int insn, u_int loc)
+{
+	const int offset = insn & 0xfff;
+	const int add_offset = (insn & 0x00800000) != 0;
+	const int bit24 = (insn & (1 << 24)) != 0;
+
+	if ((insn & 0x032f0000) == 0x010f0000) {
+		/* rA = pc, immediate index */
+		u_int target = add_offset ? loc + offset : loc - offset;
+
+		di->di_printaddr(target + 8);
+		return;
+	}
+
+	di->di_printf("[r%d", (insn >> 16) & 0x0f);
+	if ((insn & 0x03000fff) != 0x01000000) {
+		/* with bit 24 clear the bracket closes before the offset */
+		di->di_printf("%s, ", bit24 ? "" : "]");
+		if (!add_offset)
+			di->di_printf("-");
+		if (insn & (1 << 25))
+			disasm_register_shift(di, insn);
+		else
+			di->di_printf("#0x%03x", offset);
+	}
+	if (bit24)
+		di->di_printf("]");
+}
+
+/*
+ * Print the address operand of an ldrh/strh style instruction.  The
+ * 8-bit immediate is split over bits 8-11 (high nibble) and bits 0-3
+ * (low nibble).  A pc-relative access with an immediate offset is
+ * printed as the resolved address through di_printaddr.
+ */
+static void
+disasm_insn_ldrhstrh(const disasm_interface_t *di, u_int insn, u_int loc)
+{
+	const int offset = ((insn & 0xf00) >> 4) | (insn & 0xf);
+	const int add_offset = (insn & 0x00800000) != 0;
+	const int bit24 = (insn & (1 << 24)) != 0;
+
+	if ((insn & 0x004f0000) == 0x004f0000) {
+		/* rA = pc, immediate index */
+		u_int target = add_offset ? loc + offset : loc - offset;
+
+		di->di_printaddr(target + 8);
+		return;
+	}
+
+	di->di_printf("[r%d", (insn >> 16) & 0x0f);
+	if ((insn & 0x01400f0f) != 0x01400000) {
+		/* with bit 24 clear the bracket closes before the offset */
+		di->di_printf("%s, ", bit24 ? "" : "]");
+		if (!add_offset)
+			di->di_printf("-");
+		if (insn & (1 << 22))
+			di->di_printf("#0x%02x", offset);
+		else
+			di->di_printf("r%d", (insn & 0x0f));
+	}
+	if (bit24)
+		di->di_printf("]");
+}
+
+/*
+ * Print the operands of a co-processor data transfer: the co-processor
+ * register ("fN" when the co-processor number in bits 8-11 is 1, "cN"
+ * otherwise), then the base register, the word-scaled 8-bit offset and
+ * the writeback marker.
+ */
+static void
+disasm_insn_ldcstc(const disasm_interface_t *di, u_int insn, u_int loc)
+{
+	const int bit24 = (insn & (1 << 24)) != 0;
+	const u_int cp_num = (insn >> 8) & 0xf;
+
+	if (cp_num == 1)
+		di->di_printf("f%d, ", (insn >> 12) & 0x07);
+	else
+		di->di_printf("c%d, ", (insn >> 12) & 0x0f);
+
+	di->di_printf("[r%d", (insn >> 16) & 0x0f);
+
+	/* with bit 24 clear the bracket closes before the offset */
+	di->di_printf("%s, ", bit24 ? "" : "]");
+
+	if ((insn & (1 << 23)) == 0)
+		di->di_printf("-");
+
+	di->di_printf("#0x%03x", (insn & 0xff) << 2);
+
+	if (bit24)
+		di->di_printf("]");
+
+	if (insn & (1 << 21))
+		di->di_printf("!");
+}
+
+/*
+ * Fetch the word stored at 'address', treating the integer as a pointer
+ * into the current address space.
+ */
+static u_int
+disassemble_readword(u_int address)
+{
+	u_int *word = (u_int *)address;
+
+	return (*word);
+}
+
+static void
+disassemble_printaddr(u_int address)
+{	/* prints 'address' on stdout as "0x" followed by 8 hex digits */
+	printf("0x%08x", address);
+}
+
+static const disasm_interface_t disassemble_di = {	/* callback set handed to disasm() by disassemble() */
+	disassemble_readword, disassemble_printaddr, printf
+};
+
+void
+disassemble(u_int address)
+{	/* disassembles the one instruction at 'address' using disassemble_di; the address returned by disasm() is discarded */
+
+	(void)disasm(&disassemble_di, address, 0);
+}
+
+/* End of disassem.c */
@@ -0,0 +1,65 @@
+/*	$NetBSD: disassem.h,v 1.4 2001/03/04 04:15:58 matt Exp $	*/
+
+/*-
+ * Copyright (c) 1997 Mark Brinicombe.
+ * Copyright (c) 1997 Causality Limited.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Mark Brinicombe.
+ * 4. The name of the company nor the name of the author may be used to
+ *    endorse or promote products derived from this software without specific
+ *    prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Define the interface structure required by the disassembler.
+ *
+ * $FreeBSD: /repoman/r/ncvs/src/sys/arm/include/disassem.h,v 1.2 2005/01/05 21:58:48 imp Exp $
+ */
+
+#ifndef ANDROID_MACHINE_DISASSEM_H
+#define ANDROID_MACHINE_DISASSEM_H
+
+#include <sys/types.h>
+
+/*
+ * Use #ifdef rather than #if: __cplusplus is undefined when this header is
+ * included from C, and evaluating an undefined macro in #if triggers
+ * -Wundef diagnostics.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Callbacks through which the disassembler reads instruction words and
+ * emits its output.
+ */
+typedef struct {
+	/* return the word stored at the given address */
+	u_int	(*di_readword)(u_int);
+	/* print an address operand */
+	void	(*di_printaddr)(u_int);
+	/* printf-style formatted text output */
+	void	(*di_printf)(const char *, ...);
+} disasm_interface_t;
+
+/* Prototypes for callable functions */
+
+u_int disasm(const disasm_interface_t *, u_int, int);
+void disassemble(u_int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !ANDROID_MACHINE_DISASSEM_H */
@@ -0,0 +1,378 @@
+/* libs/pixelflinger/codeflinger/load_store.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#include <assert.h>
+#include <stdio.h>
+#include <cutils/log.h>
+
+#include "codeflinger/GGLAssembler.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+// Emits the instructions that store the pixel held in s.reg to the address
+// in addr.reg. addr.size selects the access width (32, 24, 16 or 8 bits);
+// any other width emits nothing. When WRITE_BACK is set in 'flags' the
+// pointer register is also advanced past the pixel.
+void GGLAssembler::store(const pointer_t& addr, const pixel_t& s, uint32_t flags)
+{
+    const bool advance = (flags & WRITE_BACK) != 0;
+    const int width = addr.size;
+
+    if (width == 32) {
+        if (advance) {
+            STR(AL, s.reg, addr.reg, immed12_post(4));
+        } else {
+            STR(AL, s.reg, addr.reg);
+        }
+    } else if (width == 24) {
+        // 24 bits formats are a little special and used only for RGB:
+        // 0x00BBGGRR is written out byte by byte as R,G,B, rotating the
+        // next byte into the low position before each following store.
+        for (int byte = 0; byte < 3; ++byte) {
+            if (byte != 0) {
+                MOV(AL, 0, s.reg, reg_imm(s.reg, ROR, 8));
+            }
+            STRB(AL, s.reg, addr.reg, immed12_pre(byte));
+        }
+        if (!(s.flags & CORRUPTIBLE)) {
+            // complete the 32-bit rotation so s.reg gets its value back
+            MOV(AL, 0, s.reg, reg_imm(s.reg, ROR, 16));
+        }
+        if (advance) {
+            ADD(AL, 0, addr.reg, addr.reg, imm(3));
+        }
+    } else if (width == 16) {
+        if (advance) {
+            STRH(AL, s.reg, addr.reg, immed8_post(2));
+        } else {
+            STRH(AL, s.reg, addr.reg);
+        }
+    } else if (width == 8) {
+        if (advance) {
+            STRB(AL, s.reg, addr.reg, immed12_post(1));
+        } else {
+            STRB(AL, s.reg, addr.reg);
+        }
+    }
+}
+
+// Emits the instructions that load a pixel from the address in addr.reg
+// into s.reg. addr.size selects the access width (32, 24, 16 or 8 bits);
+// any other width emits nothing. When WRITE_BACK is set in 'flags' the
+// pointer register is also advanced past the pixel.
+void GGLAssembler::load(const pointer_t& addr, const pixel_t& s, uint32_t flags)
+{
+    Scratch scratches(registerFile());
+
+    const bool advance = (flags & WRITE_BACK) != 0;
+    const int width = addr.size;
+
+    if (width == 32) {
+        if (advance) {
+            LDR(AL, s.reg, addr.reg, immed12_post(4));
+        } else {
+            LDR(AL, s.reg, addr.reg);
+        }
+    } else if (width == 24) {
+        // 24 bits formats are a little special and used only for RGB
+        // R,G,B is packed as 0x00BBGGRR
+        const int byteReg = scratches.obtain();
+        // Accumulate directly in s.reg unless it aliases the pointer, in
+        // which case a second scratch keeps the address alive until the
+        // last byte has been fetched.
+        const int acc = (s.reg != addr.reg) ? s.reg : scratches.obtain();
+        LDRB(AL, acc, addr.reg, immed12_pre(0));            // R
+        LDRB(AL, byteReg, addr.reg, immed12_pre(1));        // G
+        ORR(AL, 0, acc, acc, reg_imm(byteReg, LSL, 8));
+        LDRB(AL, byteReg, addr.reg, immed12_pre(2));        // B
+        ORR(AL, 0, s.reg, acc, reg_imm(byteReg, LSL, 16));
+        if (advance) {
+            ADD(AL, 0, addr.reg, addr.reg, imm(3));
+        }
+    } else if (width == 16) {
+        if (advance) {
+            LDRH(AL, s.reg, addr.reg, immed8_post(2));
+        } else {
+            LDRH(AL, s.reg, addr.reg);
+        }
+    } else if (width == 8) {
+        if (advance) {
+            LDRB(AL, s.reg, addr.reg, immed12_post(1));
+        } else {
+            LDRB(AL, s.reg, addr.reg);
+        }
+    }
+}
+
+// Emits code that isolates bits [l, h) of register 's' and leaves them
+// right-aligned in d.reg; 'bits' is the width of the packed source value.
+// On return d.s holds the width (h - l) of the extracted field.
+void GGLAssembler::extract(integer_t& d, int s, int h, int l, int bits)
+{
+    const int width = h - l;
+
+    assert(width<=8);
+    assert(h);
+
+    int from = s;       // register currently holding the value
+    int shift = l;      // right shift still required
+
+    // Bits above the field only need clearing when the field does not end
+    // at the top of the packed value.
+    if (h != bits) {
+        const int mask = ((1<<width)-1) << l;
+        if (isValidImmediate(mask)) {
+            AND(AL, 0, d.reg, from, imm(mask));     // component = packed & mask;
+        } else if (isValidImmediate(~mask)) {
+            BIC(AL, 0, d.reg, from, imm(~mask));    // component = packed & mask;
+        } else {
+            // no immediate form: push the field to the top of the register
+            // and let the right shift below bring it back down
+            const int up = 32 - h;
+            MOV(AL, 0, d.reg, reg_imm(from, LSL, up));
+            shift += up;
+        }
+        from = d.reg;
+    }
+
+    if (shift != 0) {
+        MOV(AL, 0, d.reg, reg_imm(from, LSR, shift));   // component = packed >> l;
+        from = d.reg;
+    }
+
+    // nothing was emitted above: still move the value into d.reg
+    if (from != d.reg) {
+        MOV(AL, 0, d.reg, from);
+    }
+
+    d.s = width;
+}
+
+// Extracts channel 'component' of pixel 's' into 'd': the channel's bit
+// range is read from the pixel format and handed to the register-level
+// extract() overload.
+void GGLAssembler::extract(integer_t& d, const pixel_t& s, int component)
+{
+    const int hi = s.format.c[component].h;
+    const int lo = s.format.c[component].l;
+    const int pixelBits = s.size();
+    extract(d, s.reg, hi, lo, pixelBits);
+}
+
+// Same as the integer_t overload, but for a component_t destination: the
+// extraction is done through a temporary 32-bit integer_t bound to d.reg
+// and the result is converted back into 'd'.
+void GGLAssembler::extract(component_t& d, const pixel_t& s, int component)
+{
+    const int hi = s.format.c[component].h;
+    const int lo = s.format.c[component].l;
+    const int pixelBits = s.size();
+
+    integer_t field(d.reg, 32, d.flags);
+    extract(field, s.reg, hi, lo, pixelBits);
+    d = component_t(field);
+}
+
+
+// Expands component 's' to 'dbits' bits into 'd'. A component that starts
+// at bit 0 and does not carry CLEAR_HI is expanded as is; otherwise it is
+// first extracted into d.reg and then expanded in place.
+void GGLAssembler::expand(integer_t& d, const component_t& s, int dbits)
+{
+    const bool alreadyIsolated = !s.l && !(s.flags & CLEAR_HI);
+    if (alreadyIsolated) {
+        expand(d, integer_t(s.reg, s.size(), s.flags), dbits);
+        return;
+    }
+
+    extract(d, s.reg, s.h, s.l, 32);
+    expand(d, d, dbits);
+}
+
+// component_t flavour of expand(): the work is done on a temporary 32-bit
+// integer_t bound to d.reg, which is then converted back into 'd'.
+void GGLAssembler::expand(component_t& d, const component_t& s, int dbits)
+{
+    integer_t widened(d.reg, 32, d.flags);
+    expand(widened, s, dbits);
+    d = component_t(widened);
+}
+/*
+ * Emits code that widens the src.size()-bit value in src.reg to 'dbits'
+ * bits in dst.reg by OR-ing shifted copies of the value together.
+ * dst is updated to describe the result (size = dbits, flags cleared).
+ * When dbits <= src.size() the value is only copied (if the registers
+ * differ). 'dst' and 'src' may refer to the same object.
+ */
+void GGLAssembler::expand(integer_t& dst, const integer_t& src, int dbits)
+{
+    assert(src.size());
+
+    int sbits = src.size();
+    int s = src.reg;
+    int d = dst.reg;
+
+    // be sure to set 'dst' after we read 'src' as they may be identical
+    dst.s = dbits;
+    dst.flags = 0;
+
+    if (dbits<=sbits) {
+        if (s != d) {
+            MOV(AL, 0, d, s);
+        }
+        return;
+    }
+
+    if (sbits == 1) {
+        RSB(AL, 0, d, s, reg_imm(s, LSL, dbits));
+            // d = (s<<dbits) - s;
+        return;
+    }
+
+    if (dbits % sbits) {
+        // dbits is not a multiple of sbits: start from the top and fill
+        // downwards, doubling the replicated width on every pass
+        MOV(AL, 0, d, reg_imm(s, LSL, dbits-sbits));
+            // d = s << (dbits-sbits);
+        dbits -= sbits;
+        do {
+            ORR(AL, 0, d, d, reg_imm(d, LSR, sbits));
+                // d |= d >> sbits;
+            dbits -= sbits;
+            sbits *= 2;
+        } while(dbits>0);
+        return;
+    }
+    
+    dbits -= sbits;
+    do {
+        ORR(AL, 0, d, s, reg_imm(s, LSL, sbits));
+            // d = s | (s<<sbits);  (s is d from the second pass on)
+        s = d;        
+        dbits -= sbits;
+        if (sbits*2 < dbits) {
+            sbits *= 2;
+        }
+    } while(dbits>0);
+}
+/*
+ * Emits code that reduces source component 's' (bits [s.l, s.h) of s.reg)
+ * to the width of channel 'component' of the destination pixel format and
+ * places it in d.reg at that channel's bit position.
+ *
+ * If d.flags has FIRST set, d.reg is written directly and FIRST is cleared;
+ * otherwise the result is OR-ed into d.reg, using s.reg itself as work
+ * register when it is CORRUPTIBLE and a scratch register when it is not.
+ * When the source is wider than the destination and mDithering is set,
+ * the value in dither.reg is added before the low bits are dropped.
+ *
+ * 's' is taken by value because its reg/h/l bookkeeping is updated locally
+ * while code is emitted.
+ */
+void GGLAssembler::downshift(
+        pixel_t& d, int component, component_t s, const reg_t& dither)
+{
+    const needs_t& needs = mBuilderContext.needs;   // not referenced below
+    Scratch scratches(registerFile());
+
+    int sh = s.h;
+    int sl = s.l;
+    int maskHiBits = (sh!=32) ? ((s.flags & CLEAR_HI)?1:0) : 0;
+    int maskLoBits = (sl!=0)  ? ((s.flags & CLEAR_LO)?1:0) : 0;
+    int sbits = sh - sl;
+
+    int dh = d.format.c[component].h;
+    int dl = d.format.c[component].l;
+    int dbits = dh - dl;
+    int dithering = 0;
+    
+    LOGE_IF(sbits<dbits, "sbits (%d) < dbits (%d) in downshift", sbits, dbits);
+
+    if (sbits>dbits) {
+        // see if we need to dither
+        dithering = mDithering;
+    }
+    
+    int ireg = d.reg;   // register the intermediate value is built in
+    if (!(d.flags & FIRST)) {
+        if (s.flags & CORRUPTIBLE)  {
+            ireg = s.reg;
+        } else {
+            ireg = scratches.obtain();
+        }
+    }
+    d.flags &= ~FIRST;
+
+    if (maskHiBits) {
+        // we need to mask the high bits (and possibly the lowbits too)
+        // and we might be able to use immediate mask.
+        if (!dithering) {
+            // we don't do this if we only have maskLoBits because we can
+            // do it more efficiently below (in the case where dl=0)
+            const int offset = sh - dbits;
+            if (dbits<=8 && offset >= 0) {
+                const uint32_t mask = ((1<<dbits)-1) << offset;
+                if (isValidImmediate(mask) || isValidImmediate(~mask)) {
+                    build_and_immediate(ireg, s.reg, mask, 32);
+                    sl = offset;
+                    s.reg = ireg; 
+                    sbits = dbits;
+                    maskLoBits = maskHiBits = 0;
+                }
+            }
+        } else {
+            // in the dithering case though, we need to preserve the lower bits
+            const uint32_t mask = ((1<<sbits)-1) << sl;
+            if (isValidImmediate(mask) || isValidImmediate(~mask)) {
+                build_and_immediate(ireg, s.reg, mask, 32);
+                s.reg = ireg; 
+                maskLoBits = maskHiBits = 0;
+            }
+        }
+    }
+
+    // XXX: we could special case (maskHiBits & !maskLoBits)
+    // like we do for maskLoBits below, but it happens very rarely
+    // that we have maskHiBits only and the conditions necessary to lead
+    // to better code (like doing d |= s << 24)
+
+    if (maskHiBits) {
+        // no immediate mask was usable: shift the unwanted high bits out
+        MOV(AL, 0, ireg, reg_imm(s.reg, LSL, 32-sh));
+        sl += 32-sh;
+        sh = 32;
+        s.reg = ireg;
+        maskHiBits = 0;
+    }
+
+    //	Downsampling should be performed as follows:
+    //  V * ((1<<dbits)-1) / ((1<<sbits)-1)
+    //	V * [(1<<dbits)/((1<<sbits)-1)	-	1/((1<<sbits)-1)]
+    //	V * [1/((1<<sbits)-1)>>dbits	-	1/((1<<sbits)-1)]
+    //	V/((1<<(sbits-dbits))-(1>>dbits))	-	(V>>sbits)/((1<<sbits)-1)>>sbits
+    //	V/((1<<(sbits-dbits))-(1>>dbits))	-	(V>>sbits)/(1-(1>>sbits))
+    //
+    //	By approximating (1>>dbits) and (1>>sbits) to 0:
+    //
+    //		V>>(sbits-dbits)	-	V>>sbits
+    //
+	//  A good approximation is V>>(sbits-dbits),
+    //  but better one (needed for dithering) is:
+    //
+    //		(V>>(sbits-dbits)<<sbits	-	V)>>sbits
+    //		(V<<dbits	-	V)>>sbits
+    //		(V	-	V>>dbits)>>(sbits-dbits)
+
+    // Dithering is done here
+    if (dithering) {
+        comment("dithering");
+        if (sl) {
+            // right-align the source value first
+            MOV(AL, 0, ireg, reg_imm(s.reg, LSR, sl));
+            sh -= sl;
+            sl = 0;
+            s.reg = ireg; 
+        }
+        // scaling (V-V>>dbits)
+        SUB(AL, 0, ireg, s.reg, reg_imm(s.reg, LSR, dbits));
+        // scale the dither value to the number of bits being dropped
+        const int shift = (GGL_DITHER_BITS - (sbits-dbits));
+        if (shift>0)        ADD(AL, 0, ireg, ireg, reg_imm(dither.reg, LSR, shift));
+        else if (shift<0)   ADD(AL, 0, ireg, ireg, reg_imm(dither.reg, LSL,-shift));
+        else                ADD(AL, 0, ireg, ireg, dither.reg);
+        s.reg = ireg; 
+    }
+
+    if ((maskLoBits|dithering) && (sh > dbits)) {
+        // low bits must be discarded: shift right down to dbits bits,
+        // then left into position dl
+        int shift = sh-dbits;
+        if (dl) {
+            MOV(AL, 0, ireg, reg_imm(s.reg, LSR, shift));
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(ireg, LSL, dl));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(ireg, LSL, dl));
+            }
+        } else {
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(s.reg, LSR, shift));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSR, shift));
+            }
+        }
+    } else {
+        // a single shift aligns the top of the source (sh) with the top
+        // of the destination channel (dh)
+        int shift = sh-dh;
+        if (shift>0) {
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(s.reg, LSR, shift));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSR, shift));
+            }
+        } else if (shift<0) {
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(s.reg, LSL, -shift));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSL, -shift));
+            }
+        } else {
+            if (ireg == d.reg) {
+                if (s.reg != d.reg) {
+                    MOV(AL, 0, d.reg, s.reg);
+                }
+            } else {
+                ORR(AL, 0, d.reg, d.reg, s.reg);
+            }
+        }
+    }
+}
+
+}; // namespace android