// -*- c++ -*-
//
// arm.cc:
//
#include <cassert>
#include <iostream>
#include "arm.h"
#include "breakcode.h"

using namespace std;

// enable the special code for non-pipelined behavior
#define	NON_PIPELINED_MODE

//
// constructor
//
arm::arm()
{
    // The memory/cache links are only assigned by connect_*(); start them
    // as null so that use-before-connect fails deterministically instead
    // of dereferencing an indeterminate pointer.
    mem = 0;
    icache = 0;
    dcache = 0;
}

//
// destructor
//
arm::~arm()
{
    // nothing to release here
}

//
// arm class member functions
//

// Remember the memory object handed in by the caller.
void arm::connect_memory(memory *mem)
{
    // 'this->' is required: the parameter shadows the stored pointer
    this->mem = mem;
}

// Remember the cache used for instruction fetches (see IF()/ID()).
void arm::connect_icache(cache *icache)
{
    // 'this->' is required: the parameter shadows the stored pointer
    this->icache = icache;
}

// Remember the cache used for data accesses (see EX()/MA()).
void arm::connect_dcache(cache *dcache)
{
    // 'this->' is required: the parameter shadows the stored pointer
    this->dcache = dcache;
}

// Put the processor into its start-up state: running, no multi-cycle
// execution in progress, no stall pending, register file reset.
void arm::reset()
{
    stall_cond_f = false;
    multicycle_exe_f = false;
    running_f = true;

    Regs.reset();
}

// Overwrite the program counter with 'pc'.
void arm::set_pc(sim_addr pc)
{
    PC = pc;
}

// Store 'link_addr' into the link register (R14).
void arm::set_lr(sim_addr link_addr)
{
    Regs[14] = link_addr;       // R14 == LR
}

// Store 'sp' into the stack pointer (R13).
void arm::set_sp(sim_addr sp)
{
    Regs[13] = sp;              // R13 == SP
}

// Store 'val' into register number 'regno' (no range check is done here).
void arm::set_reg(int regno, sim_word val)
{
    Regs[regno] = val;
}

// Return the current program counter.
sim_addr arm::get_pc()
{
    return PC;
}

// Return the link register (R14).
sim_addr arm::get_lr()
{
    return Regs[14];            // R14 == LR
}

// Return the stack pointer (R13).
sim_addr arm::get_sp()
{
    return Regs[13];            // R13 == SP
}

// Return the contents of register number 'regno' (no range check is done
// here).
sim_word arm::get_reg(int regno)
{
    return Regs[regno];
}


//
// LSL: logical shift left of a 32bit value.
//
//   v  : value to shift
//   n  : shift amount
//   cy : (out) shifter carry-out
//
// Returns the shifted value.  Follows the ARM register-shift rules:
//   n == 0     value unchanged, carry-out is the current C flag
//   1..31      carry-out is the last bit shifted out, i.e. bit (32 - n)
//   n == 32    result 0, carry-out is bit 0
//   n >  32    result 0, carry-out 0
// A negative amount is treated like 0 (defensive; shifting by a negative
// count would be undefined behaviour).
//
sim_word
arm::LSL(sim_word v, int n, bool &cy)
{
    if (n <= 0) {
        cy = CPSR.C()? true: false;
        return v;
    }
    if (n > 32) {
        cy = false;
        return 0;
    }
    if (n == 32) {
        cy = (v & 1)? true: false;
        return 0;
    }

    cy = ((v >> (32 - n)) & 1)? true: false;
    return v << n;
}

//
// LSR: logical shift right of a 32bit value.
//
//   v  : value to shift
//   n  : shift amount
//   cy : (out) shifter carry-out
//
// Returns the shifted value.  Follows the ARM register-shift rules:
//   n == 0     value unchanged, carry-out is the current C flag
//   1..31      carry-out is the last bit shifted out, i.e. bit (n - 1)
//   n == 32    result 0, carry-out is bit 31
//   n >  32    result 0, carry-out 0
// A negative amount is treated like 0 (defensive; shifting by a negative
// count would be undefined behaviour).
//
sim_word
arm::LSR(sim_word v, int n, bool &cy)
{
    if (n <= 0) {
        cy = CPSR.C()? true: false;
        return v;
    }
    if (n > 32) {
        cy = false;
        return 0;
    }
    if (n == 32) {
        cy = (v & 0x80000000)? true: false;
        return 0;
    }

    cy = ((v >> (n - 1)) & 1)? true: false;
    return v >> n;
}

//
// ASR: arithmetic (sign-propagating) shift right of a 32bit value.
//
//   v  : value to shift
//   n  : shift amount
//   cy : (out) shifter carry-out
//
// Returns the shifted value.  Follows the ARM register-shift rules:
//   n == 0     value unchanged, carry-out is the current C flag
//   1..31      carry-out is the last bit shifted out, i.e. bit (n - 1)
//   n >= 32    every result bit and the carry-out equal bit 31
// A negative amount is treated like 0 (defensive; shifting by a negative
// count would be undefined behaviour).
//
sim_word
arm::ASR(sim_word v, int n, bool &cy)
{
    if (n <= 0) {
        cy = CPSR.C()? true: false;
        return v;
    }
    if (n >= 32) {
        cy = (v & 0x80000000)? true: false;
        return cy? 0xffffffff: 0;
    }

    cy = ((v >> (n - 1)) & 1)? true: false;
    return (signed)v >> n;
}

//
// ROR: rotate a 32bit value right.
//
//   v  : value to rotate
//   n  : rotate amount
//   cy : (out) shifter carry-out
//
// Returns the rotated value.  Follows the ARM register-shift rules:
//   n == 0              value unchanged, carry-out is the current C flag
//   n != 0, n % 32 == 0 value unchanged, carry-out is bit 31
//   otherwise           rotate by (n mod 32); carry-out is the last bit
//                       rotated out, i.e. bit ((n mod 32) - 1)
// The zero-rotation cases are handled separately because 'v << 32' is
// undefined behaviour.
//
sim_word
arm::ROR(sim_word v, int n, bool &cy)
{
    if (n == 0) {
        cy = CPSR.C()? true: false;
        return v;
    }

    n &= 31;                    // mod 32, also well defined for negative n
    if (n == 0) {
        cy = (v & 0x80000000)? true: false;
        return v;
    }

    cy = ((v >> (n - 1)) & 1)? true: false;
    return (v >> n) | (v << (32 - n));
}

//
// Operand2_val: evaluate the second operand of a data processing
// instruction.  When IR.I() is set the operand is the immediate form
// (B, C passed on to Operand2_imm), otherwise the register form
// (B, C passed on to Operand2_reg).  'flag_set' is passed through
// unchanged.
//
sim_word
arm::Operand2_val(arm_inst IR, sim_word B, sim_word C, bool flag_set)
{
    return IR.I()? Operand2_imm(IR, B, C, flag_set)
                 : Operand2_reg(IR, B, C, flag_set);
}

//
// Operand2_reg: evaluate a shifted-register second operand.
//
//   IR       : instruction (supplies shift type and shift amount source)
//   Rm       : value of the register to be shifted
//   Rs       : value of the shift-amount register (used when IR.shRm())
//   flag_set : when true the shifter carry-out is written to CPSR.C
//
// Returns the shifted value.
//
// A register-specified shift uses only the bottom byte of Rs.  The C flag
// is left untouched for 'LSL #0' and for any register-specified shift of
// zero.
//
// NOTE(review): the immediate encodings 'LSR #0' / 'ASR #0' (meaning a
// shift by 32) and 'ROR #0' (RRX) are not given special treatment here.
//
sim_word
arm::Operand2_reg(arm_inst IR, sim_word Rm, sim_word Rs, bool flag_set)
{
    const bool by_register = IR.shRm()? true: false;
    int shmnt;
    if (by_register)
        shmnt = Rs & 0xff;      // only Rs[7:0] takes part in the shift
    else
        shmnt = IR.shAmnt();

    sim_word v = 0;
    bool cy = false;

    switch (IR.shType()) {
    case 0: v = LSL(Rm, shmnt, cy); break;
    case 1: v = LSR(Rm, shmnt, cy); break;
    case 2: v = ASR(Rm, shmnt, cy); break;
    case 3: v = ROR(Rm, shmnt, cy); break;
    default: assert(0); break;
    }

    // carry is preserved when 'LSL #0' or when a register supplies a
    // shift amount of zero
    const bool carry_preserved =
        (shmnt == 0) && (by_register || IR.shType() == 0);

    if (flag_set && !carry_preserved)
        CPSR.set_C(cy);

    return v;
}
    
//
// Operand2_imm: evaluate an immediate second operand.
//
//   IR       : instruction (not used here)
//   uimm8    : unsigned 8bit immediate
//   rot      : rotate field; the immediate is rotated right by 2*rot
//   flag_set : when true the shifter carry-out is written to CPSR.C
//
// Returns the rotated immediate.  With a zero rotation the value is
// returned as is and the C flag is left untouched; otherwise the
// carry-out is bit 31 of the rotated value.
//
sim_word
arm::Operand2_imm(arm_inst IR, sim_word uimm8, sim_word rot, bool flag_set)
{
    const int amount = (rot * 2) & 31;
    if (amount == 0)
        return uimm8;           // no rotation: C flag is preserved

    bool cy;                    // carry reported by ROR is not used here
    sim_word v = ROR(uimm8, amount, cy);
    if (flag_set)
        CPSR.set_C((v & 0x80000000)? true: false);

    return v;
}


//
// IF: instruction fetch stage.
// Latches the current PC into IADDR, asks the instruction cache for the
// word at PC and then advances PC by one instruction (4 bytes).
//
void arm::IF()
{
    cout << "---------- IF ----------" << endl;

    IADDR = PC;
    icache->request_read_word(PC);
    PC += 4;

    cout << "IADDR=" << hex << IADDR << endl
         << "PC=" << hex << PC << endl;
}

void
arm::ID()
{
    cout << "---------- ID ----------" << endl;;
    if (icache->is_done()) {
        sim_word v;
        icache->reply_read_word(IADDR, v);

        IR = v;                 // instruction is fed into IR and is decoded
    }
    else {
        IR = 0x00000000;        // IR set to zero
	cout << "*** ID stalled ***" << endl;
        set_stall_condition();
        goto ID_done;
    }

    //
    // register operand setup
    //
    A = IR.has_src_operand_1()? Regs[IR.regno_src_operand_1()]: 0;
    B = IR.has_src_operand_2()? Regs[IR.regno_src_operand_2()]: 0;
    C = IR.has_src_operand_3()? Regs[IR.regno_src_operand_3()]: 0;
#ifdef	NON_PIPELINED_MODE
    // adjust for PC
    if (IR.regno_src_operand_1() == 15) A += 4;
    if (IR.regno_src_operand_2() == 15) B += 4;
    if (IR.regno_src_operand_3() == 15) C += 4;

    // for debug
    if (IR.regno_src_operand_1() == 15)
        cout << " A is adjusted to = " << hex << A << endl;
    if (IR.regno_src_operand_2() == 15)
        cout << " B is adjusted to = " << hex << B << endl;
    if (IR.regno_src_operand_3() == 15)
        cout << " C is adjusted to = " << hex << C << endl;
#endif

    //
    // immediate (or other) operand setup
    //
    switch (IR.code()) {
    case ARM_INST_B:
    case ARM_INST_BL:
	B = IR.Offset24();	// offset address (24bit immediate)
	C = PC - 4;             // R15(PC) might be stored into R14(LR) later
#ifdef	NON_PIPELINED_MODE
	C += 4;
#endif
	break;

    case ARM_INST_AND_I:
    case ARM_INST_EOR_I:
    case ARM_INST_SUB_I:
    case ARM_INST_RSB_I:
    case ARM_INST_ADD_I:
    case ARM_INST_ADC_I:
    case ARM_INST_SBC_I:
    case ARM_INST_RSC_I:
    case ARM_INST_TST_I:
    case ARM_INST_TEQ_I:
    case ARM_INST_CMP_I:
    case ARM_INST_CMN_I:
    case ARM_INST_ORR_I:
    case ARM_INST_MOV_I:
    case ARM_INST_BIC_I:
    case ARM_INST_MVN_I:
        B = IR.UImm8();         // unsigned 8bit immediate value
        C = IR.Rs();            // shift applied to UImm8
	break;

    case ARM_INST_LDRH:
    case ARM_INST_LDRSB:
    case ARM_INST_LDRSH:
    case ARM_INST_STRH:
    case ARM_INST_STRSB:
    case ARM_INST_STRSH:
        B = IR.UOffset8HL();	// offset address (high/low nibble)
        break;

    case ARM_INST_LDR:
    case ARM_INST_LDRB:
    case ARM_INST_STR:
    case ARM_INST_STRB:
	B = IR.UOffset12();	// offset address (unsigned 12bit immediate value)
	break;

    case ARM_INST_LDM:
    case ARM_INST_STM:
	B = IR.RegList();
	break;
    }

ID_done:
    cout << "IR=" << hex << IR << endl;
    cout << "inst_code=" << hex << IR.code() << endl;

    cout << "A=" << hex << A << endl;
    cout << "B=" << hex << B << endl;
    cout << "C=" << hex << C << endl;

}

// Return 'v' shifted right by 31, i.e. its bit 31 moved down to bit 0.
int arm::sign(sim_word v)
{
    const int shift_to_top_bit = 31;
    return v >> shift_to_top_bit;
}

//
// overflow_S: signed overflow check for an addition d = s + t.
// True when both operands have the same sign and the result has the
// opposite one.
//
bool arm::overflow_S(sim_word d, sim_word s, sim_word t)
{
    const bool result_neg = sign(d)? true: false;
    const bool lhs_neg = sign(s)? true: false;
    const bool rhs_neg = sign(t)? true: false;

    if (lhs_neg != rhs_neg)
        return false;           // operands of mixed sign never overflow

    return result_neg != lhs_neg;
}

//
// overflow_U: unsigned overflow (carry-out) check for an addition
// d = s + t, derived from the top bits of the operands and the result.
//
bool arm::overflow_U(sim_word d, sim_word s, sim_word t)
{
    const bool result_top = sign(d)? true: false;
    const bool lhs_top = sign(s)? true: false;
    const bool rhs_top = sign(t)? true: false;

    if (lhs_top && rhs_top)
        return true;            // both top bits set: always carries out

    // otherwise a carry happened only if a set top bit was cleared
    return !result_top && (lhs_top || rhs_top);
}

//
// check_condition_codes: evaluate the condition field of an instruction
// against the given status flags.
//
//   IR   : instruction whose IR.Cond() field is tested
//   CPSR : status register supplying the N, Z, C and V flags
//
// Returns true when the instruction should be executed.  An undefined
// condition value is reported on cerr and evaluates to false.
//
bool
arm::check_condition_codes(arm_inst IR, arm_psr CPSR)
{
    bool cond = false;          // result for an undefined condition field

    switch (IR.Cond()) {
    case 0:                     // EQ (Z set)
        cond = CPSR.Z();
        break;
    case 1:                     // NE (Z clear)
        cond = !CPSR.Z();
        break;
    case 2:                     // CS (C set)
        cond = CPSR.C();
        break;
    case 3:                     // CC (C clear)
        cond = !CPSR.C();
        break;
    case 4:                     // MI (N set)
        cond = CPSR.N();
        break;
    case 5:                     // PL (N clear)
        cond = !CPSR.N();
        break;
    case 6:                     // VS (V set)
        cond = CPSR.V();
        break;
    case 7:                     // VC (V clear)
        cond = !CPSR.V();
        break;
    case 8:                     // HI (C set and Z clear)
        cond = CPSR.C() && !CPSR.Z();
        break;
    case 9:                     // LS (C clear or Z set)
        cond = !CPSR.C() || CPSR.Z();
        break;
    case 10:                    // GE (N equals V)
        cond = CPSR.N() == CPSR.V();
        break;
    case 11:                    // LT (N not equal to V)
        cond = CPSR.N() != CPSR.V();
        break;
    case 12:                    // GT (Z clear and N equals V)
        cond = !CPSR.Z() && (CPSR.N() == CPSR.V());
        break;
    case 13:                    // LE (Z set or N not equal to V)
        cond = CPSR.Z() || (CPSR.N() != CPSR.V());
        break;
    case 14:                    // AL (always)
        cond = true;
        break;

    default:
        cerr << "undefined condition field IR.Cond=" << hex << IR.Cond() << endl;
        break;
    }

    return cond;
}


// Return the number of 1 bits in 'n'.
int arm::count_bits(unsigned int n)
{
    int ones = 0;
    while (n != 0) {
        n &= n - 1;             // clears the lowest set bit
        ++ones;
    }
    return ones;
}

//
// next_regno: pick a bit position out of a register list bitmap.
//
//   ascending == true  : position of the lowest set bit
//                        (32 when 'reglist' is empty)
//   ascending == false : position of the highest set bit
//                        (-1 when 'reglist' is empty)
//
// The 32bit width of 'reglist' is assumed, as in the bit masks used by
// count_bits().
//
int arm::next_regno(unsigned int reglist, bool ascending)
{
    if (ascending) {
        int pos = 0;
        while (pos < 32 && ((reglist >> pos) & 1) == 0)
            ++pos;
        return pos;
    }

    int pos = 31;
    while (pos >= 0 && ((reglist >> pos) & 1) == 0)
        --pos;
    return pos;
}


void
arm::EX()
{
    cout << "---------- EX ----------" << endl;;
    
    // check condition codes (update CondOK)
    CondOK = check_condition_codes(IR, CPSR);
    if (!CondOK) {
        cout << "Flag(NZCV)="
             << CPSR.N() << CPSR.Z() << CPSR.C() << CPSR.V() << endl;
        cout << "Cond=" << hex << IR.Cond() << endl;
        cout << "(ignored instruction)" << endl;

        goto EX_done;
    }

    ALUOutput = 0;                   // clear ALU output
    ALUOutput2 = 0;                   // clear ALU output 2

    //
    // dispatch
    //
    switch (IR.code()) {    // dispatch using decoded instruction type
        //
        // Branch (control transfer)
        //
    case ARM_INST_B:
    case ARM_INST_BL:
        ALUOutput = A + (B << 2);
        if (IR.L())
            ALUOutput2 = C;
        break;

    case ARM_INST_BX:
        ALUOutput = A;
        if ((A & 1)) {          // check Thumb mode transfer
            cout << "NPC=" << hex << A
                 << " (LSB of NPC is 1)" << endl;
            cout << "THUMB state is not currently implemented" << endl;

            running_f = false;
        }
        break;

        //
        // Data Processing
        //
    case ARM_INST_TST:              // TST (== AND without write-back)
    case ARM_INST_TST_I:            // TST (== AND without write-back)
        assert(IR.S());
        /* PASS THROUGH */
    case ARM_INST_AND:              // AND
    case ARM_INST_AND_I:            // AND
        if (IR.S()) {           // set condition codes
            ALUOutput = A & Operand2_val(IR, B, C, true);

            CPSR.set_N((ALUOutput & 0x80000000)? true: false);
            CPSR.set_Z(ALUOutput? false: true);
            // C flag set by shifter
            // V flag unchanged
        }
        else {
            ALUOutput = A & Operand2_val(IR, B, C, false);
        }
        break;

    case ARM_INST_TEQ:              // TEQ (== EOR without write-back)
    case ARM_INST_TEQ_I:            // TEQ (== EOR without write-back)
        assert(IR.S());
        /* PASS THROUGH */
    case ARM_INST_EOR:              // EOR
    case ARM_INST_EOR_I:            // EOR
        if (IR.S()) {           // set condition codes
            ALUOutput = A ^ Operand2_val(IR, B, C, true);

            CPSR.set_N((ALUOutput & 0x80000000)? true: false);
            CPSR.set_Z(ALUOutput? false: true);
            // C flag set by shifter
            // V flag unchanged
        }
        else {
            ALUOutput = A ^ Operand2_val(IR, B, C, false);
        }
        break;

    case ARM_INST_CMP:              // CMP (== SUB without write-back)
    case ARM_INST_CMP_I:            // CMP (== SUB without write-back)
        assert(IR.S());
        /* PASS THROUGH */
    case ARM_INST_SUB:              // SUB
    case ARM_INST_SUB_I:            // SUB
        if (IR.S()) {           // set condition codes
            sim_word oprnd2 = Operand2_val(IR, B, C, true);
            ALUOutput = A - oprnd2;

            CPSR.set_N((ALUOutput & 0x80000000)? true: false);
            CPSR.set_Z(ALUOutput? false: true);
            CPSR.set_C(overflow_U(ALUOutput, A, -oprnd2));
            CPSR.set_V(overflow_S(ALUOutput, A, -oprnd2));
        }
        else {
            ALUOutput = A - Operand2_val(IR, B, C, false);
        }
        break;

    case ARM_INST_RSB:              // RSB
    case ARM_INST_RSB_I:            // RSB
        if (IR.S()) {           // set condition codes
            sim_word oprnd2 = Operand2_val(IR, B, C, true);
            ALUOutput = oprnd2 - A;

            CPSR.set_N((ALUOutput & 0x80000000)? true: false);
            CPSR.set_Z(ALUOutput? false: true);
            CPSR.set_C(overflow_U(ALUOutput, oprnd2, -A));
            CPSR.set_V(overflow_S(ALUOutput, oprnd2, -A));
        }
        else {
            ALUOutput = Operand2_val(IR, B, C, false) - A;
        }
        break;

    case ARM_INST_CMN:              // CMN (== ADD without write-back)
    case ARM_INST_CMN_I:            // CMN (== ADD without write-back)
        assert(IR.S());
        /* PASS THROUGH */
    case ARM_INST_ADD:              // ADD
    case ARM_INST_ADD_I:            // ADD
        if (IR.S()) {           // set condition codes
            sim_word oprnd2 = Operand2_val(IR, B, C, true);
            ALUOutput = A + oprnd2;

            CPSR.set_N((ALUOutput & 0x80000000)? true: false);
            CPSR.set_Z(ALUOutput? false: true);
            CPSR.set_C(overflow_U(ALUOutput, A, oprnd2));
            CPSR.set_V(overflow_S(ALUOutput, A, oprnd2));
        }
        else {
            ALUOutput = A + Operand2_val(IR, B, C, false);
        }
        break;

    case ARM_INST_ADC:              // ADC
    case ARM_INST_ADC_I:            // ADC
        if (IR.S()) {           // set condition codes
            sim_word oprnd2 = Operand2_val(IR, B, C, true);
            ALUOutput = A + oprnd2 + carry();

            CPSR.set_N((ALUOutput & 0x80000000)? true: false);
            CPSR.set_Z(ALUOutput? false: true);
            CPSR.set_C(overflow_U(ALUOutput, A, oprnd2 + carry()));
            CPSR.set_V(overflow_S(ALUOutput, A, oprnd2 + carry()));
        }
        else {
            ALUOutput = A + Operand2_val(IR, B, C, false) + carry();
        }
        break;

    case ARM_INST_SBC:              // SBC
    case ARM_INST_SBC_I:            // SBC
        if (IR.S()) {           // set condition codes
            sim_word oprnd2 = Operand2_val(IR, B, C, true);
            ALUOutput = A - oprnd2 + carry()-1;

            CPSR.set_N((ALUOutput & 0x80000000)? true: false);
            CPSR.set_Z(ALUOutput? false: true);
            CPSR.set_C(overflow_U(ALUOutput, A, -(oprnd2-carry()+1)));
            CPSR.set_V(overflow_S(ALUOutput, A, -(oprnd2-carry()+1)));
        }
        else {
            ALUOutput = A - Operand2_val(IR, B, C, false) + carry()-1;
        }
        break;

    case ARM_INST_RSC:              // RSC
    case ARM_INST_RSC_I:            // RSC
        if (IR.S()) {           // set condition codes
            sim_word oprnd2 = Operand2_val(IR, B, C, true);
            ALUOutput = oprnd2 - A + carry()-1;

            CPSR.set_N((ALUOutput & 0x80000000)? true: false);
            CPSR.set_Z(ALUOutput? false: true);
            CPSR.set_C(overflow_U(ALUOutput, oprnd2, -(A-carry()+1)));
            CPSR.set_V(overflow_S(ALUOutput, oprnd2, -(A-carry()+1)));
        }
        else {
            ALUOutput = Operand2_val(IR, B, C, false) - A + carry()-1;
        }
        break;

    case ARM_INST_ORR:              // ORR
    case ARM_INST_ORR_I:            // ORR
        if (IR.S()) {           // set condition codes
            ALUOutput = A | Operand2_val(IR, B, C, true);

            CPSR.set_N((ALUOutput & 0x80000000)? true: false);
            CPSR.set_Z(ALUOutput? false: true);
            // C flag set by shifter
            // V flag unchanged
        }
        else {
            ALUOutput = A | Operand2_val(IR, B, C, false);
        }
        break;

    case ARM_INST_MOV:              // MOV
    case ARM_INST_MOV_I:            // MOV
        if (IR.S()) {           // set condition codes
            ALUOutput = Operand2_val(IR, B, C, true);

            CPSR.set_N((ALUOutput & 0x80000000)? true: false);
            CPSR.set_Z(ALUOutput? false: true);
            // C flag set by shifter
            // V flag unchanged
        }
        else {
            ALUOutput = Operand2_val(IR, B, C, false);
        }
        break;

    case ARM_INST_BIC:              // BIC
    case ARM_INST_BIC_I:            // BIC
        if (IR.S()) {           // set condition codes
            ALUOutput = A & ~Operand2_val(IR, B, C, true);

            CPSR.set_N((ALUOutput & 0x80000000)? true: false);
            CPSR.set_Z(ALUOutput? false: true);
            // C flag set by shifter
            // V flag unchanged
        }
        else {
            ALUOutput = A & ~Operand2_val(IR, B, C, false);
        }
        break;

    case ARM_INST_MVN:              // MVN
    case ARM_INST_MVN_I:            // MVN
        if (IR.S()) {        // set condition codes
            ALUOutput = ~Operand2_val(IR, B, C, true);

            CPSR.set_N((ALUOutput & 0x80000000)? true: false);
            CPSR.set_Z(ALUOutput? false: true);
            // C flag set by shifter
            // V flag unchanged
        }
        else {
            ALUOutput = ~Operand2_val(IR, B, C, false);
        }
        break;

        //
        // Single Data Transfer (base+offset)
        //
    case ARM_INST_LDR:
    case ARM_INST_LDRB:
    case ARM_INST_LDRSB:
    case ARM_INST_LDRH:
    case ARM_INST_LDRSH:

    case ARM_INST_STR:
    case ARM_INST_STRB:
    case ARM_INST_STRSB:
    case ARM_INST_STRH:
    case ARM_INST_STRSH:

        {
            ALUOutput = IR.U()? A + B: A - B; // calculate address
            sim_addr eaddr = IR.P()? ALUOutput: A;
	    cout << "effective address = " << hex << eaddr << endl;

            switch (IR.code()) {
            case ARM_INST_LDR:
                dcache->request_read_word(eaddr & ~3);
                break;

            case ARM_INST_LDRH:
            case ARM_INST_LDRSH:
                dcache->request_read_hword(eaddr & ~1);
                break;

            case ARM_INST_LDRB:
            case ARM_INST_LDRSB:
                dcache->request_read_byte(eaddr);
                break;

            case ARM_INST_STR:
                dcache->request_write_word(eaddr & ~3, C);
                break;

            case ARM_INST_STRH:
            case ARM_INST_STRSH:
                dcache->request_write_hword(eaddr & ~1, C);
                break;

            case ARM_INST_STRB:
            case ARM_INST_STRSB:
                dcache->request_write_byte(eaddr, C);
                break;

            default:
                assert(0);
                break;
            }
        }
        break;


        //
        // Single Data Transfer (base+index)
        //
    case ARM_INST_LDR_RR:
    case ARM_INST_LDRB_RR:
    case ARM_INST_STR_RR:
    case ARM_INST_STRB_RR:
        {
            int shmnt = IR.shRm()? Regs[IR.Rs()]: IR.shAmnt();
            int indx;
            bool cy;
#if 0
            switch (IR.shType()) {
            case 0: indx = LSL(B, shmnt, cy); break;
            case 1: indx = LSR(B, shmnt, cy); break;
            case 2: indx = ASR(B, shmnt, cy); break;
            case 3: indx = ROR(B, shmnt, cy); break;
	    default: assert(0); break;
            }
#else
	    sim_word (arm::*func[])(sim_word, int, bool &) = {
		&arm::LSL, &arm::LSR, &arm::ASR, &arm::ROR
	    };
	    indx = (this->*func[IR.shType()])(B, shmnt, cy);
#endif
            ALUOutput = IR.U()? A + indx: A - indx;
            sim_addr eaddr = IR.P()? ALUOutput: A;
	    cout << "effective address = " << hex << eaddr << endl;

            switch (IR.code()) {
            case ARM_INST_LDR_RR:
                dcache->request_read_word(eaddr & ~3);
                break;
            case ARM_INST_LDRB_RR:
                dcache->request_read_byte(eaddr);
                break;
            case ARM_INST_STR_RR:
                dcache->request_write_word(eaddr & ~3, C);
                break;
            case ARM_INST_STRB_RR:
                dcache->request_write_byte(eaddr, C);
                break;
            default:
                assert(0);
                break;
            }
        }
        break;


    case ARM_INST_LDRH_RR:
    case ARM_INST_LDRSH_RR:
    case ARM_INST_LDRSB_RR:
    case ARM_INST_STRH_RR:
    case ARM_INST_STRSH_RR:
    case ARM_INST_STRSB_RR:
        {
            ALUOutput = IR.U()? A + B: A - B; // calculate address
            sim_addr eaddr = IR.P()? ALUOutput: A;
	    cout << "effective address = " << hex << eaddr << endl;

            switch (IR.code()) {
            case ARM_INST_LDRH_RR:
            case ARM_INST_LDRSH_RR:
                dcache->request_read_hword(eaddr & ~1);
                break;

            case ARM_INST_LDRSB_RR:
                dcache->request_read_byte(eaddr);
                break;

            case ARM_INST_STRH_RR:
            case ARM_INST_STRSH_RR:
                dcache->request_write_hword(eaddr & ~1, C);
                break;

            case ARM_INST_STRSB:
                dcache->request_write_byte(eaddr, C);
                break;

            default:
                assert(0);
                break;
            }
        }
        break;

        //
        // Block Data Transfer (needs multi-cycle execution)
        //
    case ARM_INST_LDM:
    case ARM_INST_STM:
	if (B) {		// check register list is not empty
            ALUOutput = IR.U()? A + 4: A - 4;
            sim_addr eaddr = IR.P()? ALUOutput: A;
	    cout << "effective address = " << hex << eaddr << endl;
            switch (IR.code()) {
            case ARM_INST_LDM:
                dcache->request_read_word(eaddr & ~3);
                break;
            case ARM_INST_STM:
                C = Regs[next_regno(B, IR.U())];
                dcache->request_write_word(eaddr & ~3, C);
                break;
            default:
                assert(0);
                break;
            }
        }
        break;


        //
        // Data Swap
        //
    case ARM_INST_SWP:
        {
	    sim_addr eaddr = ALUOutput = A;
	    cout << "effective address = " << hex << eaddr << endl;
	    if (!multicycle_exe_f) { // first time
                dcache->request_read_word(eaddr & ~3);
	    }
	    else {		// second time
                dcache->request_write_word(eaddr & ~3, C);
	    }
	}
        break;

    case ARM_INST_SWPB:
        {
	    sim_addr eaddr = ALUOutput = A;
	    cout << "effective address = " << hex << eaddr << endl;
	    if (!multicycle_exe_f) { // first time
                dcache->request_read_byte(eaddr);
	    }
	    else {		// second time
                dcache->request_write_byte(eaddr, C);
	    }
	}
        break;

        //
        // Multiplication
        //
    case ARM_INST_MUL:
        ALUOutput = IR.A()? A + B*C: B*C;
        if (IR.S()) {           // set condition codes
            CPSR.set_N((ALUOutput & 0x80000000)? true: false);
            CPSR.set_Z(ALUOutput? false: true);
            // C flag value is meaningless (unpredictable result)
            // V flag unchanged
	}
        break;

    case ARM_INST_MULL:
        if (IR.SMul()) {        // signed multiply
            long long a = (signed)A;
            long long b = (signed)B;
            long long c = (signed)Regs[IR.Rn()];
            c = (c << 32)|Regs[IR.Rd()];

            long long v = IR.A()? c + a*b: a*b;

            ALUOutput = v;        // lower 32bit
            ALUOutput2 = v >> 32; // higher 32bit
        }
        else {                  // unsigned multiply
            unsigned long long a = A;
            unsigned long long b = B;
            unsigned long long c = Regs[IR.Rn()];
            c = (c << 32)|Regs[IR.Rd()];

            unsigned long long v = IR.A()? c + a*b: a*b;

            ALUOutput = v;        // lower 32bit
            ALUOutput2 = v >> 32; // higher 32bit
        }

        if (IR.S()) {           // set condition codes
            CPSR.set_N((ALUOutput2 & 0x80000000)? true: false);
            CPSR.set_Z((ALUOutput|ALUOutput2)? false: true);
            // C flag value is meaningless (unpredictable result)
            // V flag value is meaningless (unpredictable result)
        }

        cout << hex
             << " Mult(hi)=" << ALUOutput2
             << " Mult(lo)=" << ALUOutput << endl;
        break;

        //
        // Coprocessor Data Operation/Data Transfer/Register Transfer
        //
    case ARM_INST_CDP:
    case ARM_INST_LDC:
    case ARM_INST_STC:
    case ARM_INST_MCR:
    case ARM_INST_MRC:
        cerr << "coprocessor instruction not implemented" << endl;
        running_f = false;
        break;

        //
        // PSR transfer
        //
    case ARM_INST_MRS:
    case ARM_INST_MSR:
        cerr << "MRS/MSR is not implemented" << endl;
        running_f = false;
        break;

        //
        // software interrupt/system call
        //
    case ARM_INST_SWI:
        if (CondOK)
            do_swi(IR.UOffset24());
        break;

    default:
	cerr << "unknown instruction " << hex << IR << " " << IR.code() << endl;
        running_f = false;
	break;
    }

EX_done:
    cout << "CondOK=" << dec << CondOK << endl;
    cout << "ALUOutput=" << hex << ALUOutput << endl;
    cout << "ALUOutput2=" << hex << ALUOutput2 << endl;
    cout << "Flag(NZCV)="
         << CPSR.N() << CPSR.Z() << CPSR.C() << CPSR.V() << endl;
}

//
// MA: memory access stage.
//
// Clears LMD, then (unless the condition check in EX failed) completes
// the work selected by IR.code():
//   - branches copy ALUOutput into PC,
//   - data processing, multiply and SWI have nothing to do here,
//   - loads/stores/swaps collect the reply for the data cache request
//     issued in EX; loaded data ends up in LMD.  When the data cache has
//     not finished yet a stall is raised instead,
//   - unimplemented or unknown instructions stop the simulation
//     (running_f = false).
//
void
arm::MA()
{
    cout << "---------- MA ----------" << endl;;

    LMD = 0;                    // clear memory data bus

    if (!CondOK) {
        cout << "(ignored instruction)" << endl;
        goto MA_done;
    }

    //
    // dispatch
    //
    switch (IR.code()) {	// dispatch using decoded instruction type
        //
        // Branch (control transfer)
        //
    case ARM_INST_B:
    case ARM_INST_BL:
    case ARM_INST_BX:
        PC = ALUOutput;         // branch target computed in EX
        cout << "PC=" << hex << PC << endl;
        break;

        //
        // Data Processing
        //
    case ARM_INST_TST:          // TST (== AND without write-back)
    case ARM_INST_AND:          // AND
    case ARM_INST_TEQ:          // TEQ (== EOR without write-back)
    case ARM_INST_EOR:          // EOR
    case ARM_INST_CMP:          // CMP (== SUB without write-back)
    case ARM_INST_SUB:          // SUB
    case ARM_INST_RSB:          // RSB
    case ARM_INST_CMN:          // CMN (== ADD without write-back)
    case ARM_INST_ADD:          // ADD
    case ARM_INST_ADC:          // ADC
    case ARM_INST_SBC:          // SBC
    case ARM_INST_RSC:          // RSC
    case ARM_INST_ORR:          // ORR
    case ARM_INST_MOV:          // MOV
    case ARM_INST_BIC:          // BIC
    case ARM_INST_MVN:          // MVN

    case ARM_INST_TST_I:        // TST (== AND without write-back)
    case ARM_INST_AND_I:        // AND
    case ARM_INST_TEQ_I:        // TEQ (== EOR without write-back)
    case ARM_INST_EOR_I:        // EOR
    case ARM_INST_CMP_I:        // CMP (== SUB without write-back)
    case ARM_INST_SUB_I:        // SUB
    case ARM_INST_RSB_I:        // RSB
    case ARM_INST_CMN_I:        // CMN (== ADD without write-back)
    case ARM_INST_ADD_I:        // ADD
    case ARM_INST_ADC_I:        // ADC
    case ARM_INST_SBC_I:        // SBC
    case ARM_INST_RSC_I:        // RSC
    case ARM_INST_ORR_I:        // ORR
    case ARM_INST_MOV_I:        // MOV
    case ARM_INST_BIC_I:        // BIC
    case ARM_INST_MVN_I:        // MVN

        /* nothing to do */

        break;

        //
        // Single Data Transfer/Data Swap
        //
        // Data Load (base + offset)
    case ARM_INST_LDR:
    case ARM_INST_LDRB:
    case ARM_INST_LDRSB:
    case ARM_INST_LDRH:
    case ARM_INST_LDRSH:
        // Data Load (base + index)
    case ARM_INST_LDR_RR:
    case ARM_INST_LDRB_RR:
    case ARM_INST_LDRSB_RR:
    case ARM_INST_LDRH_RR:
    case ARM_INST_LDRSH_RR:

        // Data Store (base + offset)
    case ARM_INST_STR:
    case ARM_INST_STRB:
    case ARM_INST_STRSB:
    case ARM_INST_STRH:
    case ARM_INST_STRSH:
        // Data Store (base + index)
    case ARM_INST_STR_RR:
    case ARM_INST_STRB_RR:
    case ARM_INST_STRSB_RR:
    case ARM_INST_STRH_RR:
    case ARM_INST_STRSH_RR:

        // Data Swap
    case ARM_INST_SWP:
    case ARM_INST_SWPB:

        if (!dcache->is_done()) {
            // the request issued in EX has not completed yet
            cout << "*** MA stalled ***" << endl;
            set_stall_condition();
        }
        else {
            // same address selection as in EX: computed address when
            // IR.P() is set, otherwise the base held in A
            sim_addr eaddr = IR.P()? ALUOutput: A;
            cout << "effective address = " << hex << eaddr << endl;

            switch (IR.code()) {
            case ARM_INST_LDR:     // Word
            case ARM_INST_LDR_RR:  // Word
                {
                    dcache->reply_read_word(eaddr & ~3, LMD);
                    // LMD is rotated according to lower 2bit of eaddr
                    // (little-endian assumed)
                    bool cy;    // carry reported by ROR is not used
                    LMD = ROR(LMD, (eaddr & 3)*8, cy);
                }
                break;

            case ARM_INST_LDRH:    // Unsigned halfword
            case ARM_INST_LDRH_RR: // Unsigned halfword
                {
                    sim_hword v;
                    dcache->reply_read_hword(eaddr & ~1, v);
                    // LMD is rotated according to lower 1bit of eaddr
                    // (little-endian assumed)
                    bool cy;    // carry reported by ROR is not used
                    LMD = ROR(v, (eaddr & 1)*16, cy);
                }
                break;

            case ARM_INST_LDRSH:   // Signed halfword
            case ARM_INST_LDRSH_RR: // Signed halfword
                {
                    sim_hword v;
                    dcache->reply_read_hword(eaddr & ~1, v);
                    // LMD is rotated according to lower 1bit of eaddr
                    // (little-endian assumed)
                    bool cy;    // carry reported by ROR is not used
                    v = ROR(v, (eaddr & 1)*16, cy);
                    // sign extend from bit 15
                    LMD = (v & 0x8000)? 0xffff0000|v: v;
                }
                break;

            case ARM_INST_LDRB:    // Unsigned byte
            case ARM_INST_LDRB_RR: // Unsigned byte
                {
                    sim_byte v;
                    dcache->reply_read_byte(eaddr, v);
                    LMD = v;
                }
                break;

            case ARM_INST_LDRSB:   // Signed byte
            case ARM_INST_LDRSB_RR: // Signed byte
                {
                    sim_byte v;
                    dcache->reply_read_byte(eaddr, v);
                    // sign extend from bit 7
                    LMD = (v & 0x80)? 0xffffff00|v: v;
                }
                break;


            case ARM_INST_STR:     // Word
            case ARM_INST_STR_RR:  // Word
                dcache->reply_write_word(eaddr & ~3);
                break;

            case ARM_INST_STRH:    // Unsigned halfword
            case ARM_INST_STRH_RR: // Unsigned halfword
            case ARM_INST_STRSH:   // Signed halfword
            case ARM_INST_STRSH_RR: // Signed halfword
                dcache->reply_write_hword(eaddr & ~1);
                break;

            case ARM_INST_STRB:    // Unsigned byte
            case ARM_INST_STRB_RR: // Unsigned byte
            case ARM_INST_STRSB:   // Signed byte
            case ARM_INST_STRSB_RR: // Signed byte
                dcache->reply_write_byte(eaddr);
                break;


                // Data Swap
                // (multicycle_exe_f tells the read pass from the write
                // pass, matching the request issued in EX)
            case ARM_INST_SWP:
                if (!multicycle_exe_f) { // first time
                    dcache->reply_read_word(eaddr & ~3, LMD);
                    // LMD is rotated according to lower 2bit of eaddr
                    // (little-endian assumed)
                    bool cy;    // carry reported by ROR is not used
                    LMD = ROR(LMD, (eaddr & 3)*8, cy);
                }
                else {		// second time
                    dcache->reply_write_word(eaddr & ~3);
                }
                break;

            case ARM_INST_SWPB:
                if (!multicycle_exe_f) { // first time
                    sim_byte v;
                    dcache->reply_read_byte(eaddr, v);
                    LMD = v;
                }
                else {		// second time
                    dcache->reply_write_byte(eaddr);
                }
                break;

            default:
                assert(0);
                break;
            }
        }
        break;

        //
        // Block Data Transfer (needs multi-cycle execution)
        //
    case ARM_INST_LDM:
    case ARM_INST_STM:
	if (B) {		// check register list is not empty
            if (!dcache->is_done()) {
                // the request issued in EX has not completed yet
                cout << "*** MA stalled ***" << endl;
                set_stall_condition();
            }
            else {
                sim_addr eaddr = IR.P()? ALUOutput: A;
		cout << "effective address = " << hex << eaddr << endl;

                // IR.Ld() selects between the load and the store reply
                if (IR.Ld())
                    dcache->reply_read_word(eaddr & ~3, LMD);
                else
                    dcache->reply_write_word(eaddr & ~3);
            }
        }
        break;

        //
        // Multiplication
        //
    case ARM_INST_MUL:
    case ARM_INST_MULL:
        /* nothing to do */
        break;

        //
        // Coprocessor Data Operation/Data Transfer/Register Transfer
        //
    case ARM_INST_CDP:
    case ARM_INST_LDC:
    case ARM_INST_STC:
    case ARM_INST_MCR:
    case ARM_INST_MRC:
        cerr << "coprocessor instruction not implemented" << endl;
        running_f = false;
        break;

        //
        // PSR transfer
        //
    case ARM_INST_MRS:
    case ARM_INST_MSR:
        cerr << "MRS/MSR is not implemented" << endl;
        running_f = false;
        break;

        //
        // software interrupt/system call
        //
    case ARM_INST_SWI:
        /* nothing to do */
        break;

    default:
	cerr << "unknown instruction " << hex << IR << " " << IR.code() << endl;
        running_f = false;
	break;
    }

MA_done:
    cout << "LMD=" << hex << LMD << endl;
}

void
arm::WB()
{
    cout << "---------- WB ----------" << endl;;

    if (!CondOK) {
        cout << "(ignored instruction)" << endl;
        goto WB_done;
    }

    //
    // dispatch
    //
    switch (IR.code()) {	// dispatch using decoded instruction type
        //
        // Branch (control transfer)
        //
    case ARM_INST_B:
    case ARM_INST_BX:
        // nothing to be written back
        break;

    case ARM_INST_BL:
        Regs[14] = ALUOutput2;
        cout << "LR=" << hex << Regs[14] << endl;
        break;

        //
        // Data Processing
        //
    case ARM_INST_AND:          // AND
    case ARM_INST_EOR:          // EOR
    case ARM_INST_SUB:          // SUB
    case ARM_INST_RSB:          // RSB
    case ARM_INST_ADD:          // ADD
    case ARM_INST_ADC:          // ADC
    case ARM_INST_SBC:          // SBC
    case ARM_INST_RSC:          // RSC
    case ARM_INST_ORR:          // ORR
    case ARM_INST_MOV:          // MOV
    case ARM_INST_BIC:          // BIC
    case ARM_INST_MVN:          // MVN

    case ARM_INST_AND_I:        // AND
    case ARM_INST_EOR_I:        // EOR
    case ARM_INST_SUB_I:        // SUB
    case ARM_INST_RSB_I:        // RSB
    case ARM_INST_ADD_I:        // ADD
    case ARM_INST_ADC_I:        // ADC
    case ARM_INST_SBC_I:        // SBC
    case ARM_INST_RSC_I:        // RSC
    case ARM_INST_ORR_I:        // ORR
    case ARM_INST_MOV_I:        // MOV
    case ARM_INST_BIC_I:        // BIC
    case ARM_INST_MVN_I:        // MVN

        Regs[IR.Rd()] = ALUOutput; // written back to register
        cout << "Reg[" << dec << IR.Rd() << "]=" << hex << Regs[IR.Rd()] << endl;
        break;

    case ARM_INST_TST:          // TST (== AND without write-back)
    case ARM_INST_TEQ:          // TEQ (== EOR without write-back)
    case ARM_INST_CMP:          // CMP (== SUB without write-back)
    case ARM_INST_CMN:          // CMN (== ADD without write-back)

    case ARM_INST_TST_I:        // TST (== AND without write-back)
    case ARM_INST_TEQ_I:        // TEQ (== EOR without write-back)
    case ARM_INST_CMP_I:        // CMP (== SUB without write-back)
    case ARM_INST_CMN_I:        // CMN (== ADD without write-back)
        // nothing to be written back
        break;

        //
        // Single Data Transfer/Data Swap
        //

        // Data Load
    case ARM_INST_LDR:
    case ARM_INST_LDRH:
    case ARM_INST_LDRSH:
    case ARM_INST_LDRB:
    case ARM_INST_LDRSB:
    case ARM_INST_LDR_RR:
    case ARM_INST_LDRB_RR:
    case ARM_INST_LDRH_RR:
    case ARM_INST_LDRSH_RR:
    case ARM_INST_LDRSB_RR:
        assert(IR.Ld());

        Regs[IR.Rd()] = LMD;
        cout << "Reg[" << dec << IR.Rd() << "]=" << hex << Regs[IR.Rd()] << endl;

        if (IR.W()) {           // Write-back bit
            Regs[IR.Rn()] = ALUOutput;
            cout << "Reg[" << dec << IR.Rn() << "]=" << hex << Regs[IR.Rn()] << endl;
        }
        break;

        // Data Store
    case ARM_INST_STR:
    case ARM_INST_STRH:
    case ARM_INST_STRSH:
    case ARM_INST_STRB:
    case ARM_INST_STRSB:
    case ARM_INST_STR_RR:
    case ARM_INST_STRB_RR:
    case ARM_INST_STRH_RR:
    case ARM_INST_STRSH_RR:
    case ARM_INST_STRSB_RR:
        assert(!IR.Ld());

        if (IR.W()) {           // Write-back bit
            Regs[IR.Rn()] = ALUOutput;
            cout << "Reg[" << dec << IR.Rn() << "]=" << hex << Regs[IR.Rn()] << endl;
        }
        break;

        //
        // Data Swap
        //
    case ARM_INST_SWP:
    case ARM_INST_SWPB:
        if (!multicycle_exe_f) { // first time
            Regs[IR.Rd()] = LMD;
            cout << "Reg[" << dec << IR.Rd() << "]=" << hex << Regs[IR.Rd()] << endl;

            multicycle_exe_f = true;
        }
        else {                  // second time
            multicycle_exe_f = false;
        }
        break;

        //
        // Block Data Transfer (needs multi-cycle execution)
        //
    case ARM_INST_LDM:
    case ARM_INST_STM:
	if (B) {		// check register list is not empty
	    int regno = next_regno(B, IR.U());
	    assert(regno >= 0 && regno < 16);

	    if (IR.Ld()) {	// Load
		Regs[regno] = LMD;
		cout << "Reg[" << dec << regno << "]=" << hex << Regs[regno] << endl;
	    }
	    else {
		cout << "Reg[" << dec << regno << "] stored (" << hex << Regs[regno] << ")"
                     << endl;
	    }

	    A = ALUOutput;
	    B &= ~(1 << regno);	// clear one bit (for current register number)

	    // check multi-cycle execution
            multicycle_exe_f = B? true: false;
	    if (!multicycle_exe_f) {
		if (IR.W()) {       // Write-back bit
		    Regs[IR.Rn()] = ALUOutput;
		    cout << "Reg[" << dec << IR.Rn() << "]=" << hex << Regs[IR.Rn()] << endl;
		}
	    }
	    else
		cout << " (now in multi-cycle)" << endl;
        }
        break;

        //
        // Multiplication
        //
    case ARM_INST_MUL:
        Regs[IR.Rn()] = ALUOutput;
        cout << "Reg[" << dec << IR.Rn() << "]=" << hex << Regs[IR.Rn()] << endl;
        break;

    case ARM_INST_MULL:
        Regs[IR.Rn()] = ALUOutput2;
        Regs[IR.Rd()] = ALUOutput;
        cout << "Reg[" << dec << IR.Rn() << "]=" << hex << Regs[IR.Rn()] << endl;
        cout << "Reg[" << dec << IR.Rd() << "]=" << hex << Regs[IR.Rd()] << endl;
        break;

        //
        // Coprocessor Data Operation/Data Transfer/Register Transfer
        //
    case ARM_INST_CDP:
    case ARM_INST_LDC:
    case ARM_INST_STC:
    case ARM_INST_MCR:
    case ARM_INST_MRC:
        cerr << "coprocessor instruction not implemented" << endl;
        running_f = false;
        break;

        //
        // PSR transfer
        //
    case ARM_INST_MRS:
    case ARM_INST_MSR:
        cerr << "MRS/MSR is not implemented" << endl;
        running_f = false;
        break;

        //
        // software interrupt/system call
        //
    case ARM_INST_SWI:
        /* nothing to do */
        break;

    default:
	cerr << "unknown instruction " << hex << IR << " " << IR.code() << endl;
        running_f = false;
	break;
    }

WB_done:
    cout << "IADDR=" << hex << IADDR << endl;
    cout << "PC=" << hex << PC << endl;
    cout << "IR=" << hex << IR << endl;

    if (!multicycle_exe_f)
        cout << "(completed)" << endl;
}

//
// do_swi: service a software interrupt (SWI) for the simulated program
//
// SWI handler interface
//
// input
// fcode: function code (one of the BREAK_* values)
// r0,r1,r2,r3: arg1,arg2,arg3,arg4
//
// output
// r0: return value
//
// Only r0 is used by the handlers below: the read handlers return the
// value in r0 (BREAK_READ_STRING instead treats r0 as the destination
// address in simulated memory), and the print handlers take the value or
// the string address from r0.  An unknown function code is reported on
// cerr and otherwise ignored.
//
void
arm::do_swi(int fcode)
{
    cout << "SWI called!" << endl;

    // you can add your new break handler routines here
    switch (fcode) {

        // program termination
    case BREAK_PROGRAM_EXIT:
        running_f = false;
        break;

        // read data
    case BREAK_READ_INT:
        {
            int v = 0;          // defined result even if the extraction fails
            cin >> v;
            Regs[0] = v;

            // force decimal: the stage traces leave cout in hex mode
            cout << "read value(int) = " << dec << v << endl;
        }
        break;
    case BREAK_READ_CHAR:
        {
            char c = '\0';      // defined result even if the extraction fails
            cin >> c;
            Regs[0] = c;

            cout << "read value(char) = " << c << endl;
        }
        break;
    case BREAK_READ_STRING:
        {
            char tmpbuf[2048] = "";     // stays empty if the extraction fails

            // Bound the extraction to the buffer size; without a field
            // width, operator>> on a char buffer can write past its end.
            cin.width(sizeof(tmpbuf));
            cin >> tmpbuf;

            // Copy the token into simulated memory at the address in r0,
            // including the terminating NUL so the program receives a
            // proper C string (BREAK_PRINT_STRING relies on that NUL).
            int i = 0;
            for (; tmpbuf[i] != '\0'; i++)
                mem->write_byte(Regs[0] + i, tmpbuf[i]);
            mem->write_byte(Regs[0] + i, '\0');

            cout << "read value(string) = " << tmpbuf << endl;
        }
        break;

        // print data
    case BREAK_PRINT_INT:
        cout << dec << Regs[0] << endl;
        break;
    case BREAK_PRINT_CHAR:
        cout << (char)(Regs[0]) << endl;
        break;
    case BREAK_PRINT_STRING:
        // print bytes starting at the address in r0 up to (not including)
        // the first NUL byte
        for (int i = 0; ; i++) {
            char c = mem->read_byte(Regs[0] + i);
            if (c == '\0')
                break;

            cout << c;
        }
        cout << endl;
        break;

    default:
        cerr << "unknown SWI fcode " << fcode << endl;
        break;
    }
}

// end of arm.cc