Masm assembly 8086 carry flag between data word ad

2020-04-18 03:22发布

问题:

So I have this problem I'm supposed to solve and I've spent hours trying to figure out the best way to do this, google hasn't been of much help.

The problem is to create a subroutine that is given a list of words that you then add with another list that becomes the output. Its basically a method of working with large numbers.

My code works fine for carry flags within words, but for carry flag from one full word to another it does not work. The first 16 bit word (0005 in the example below) is a flag used to tell my subroutine how many words there are.

For instance, given the following input,

//si     0005 0000 EEEE DDDD CCCC BBBB
//di     0005 0000 1111 2222 3333 4445

when the output expected is:

0005 0001 0000 0000 0000 0000

My code produces:

0005 0000 FFFF FFFF FFFF 0000 

I believe I understand why this is happening for the most part, but am unsure of the best way to solve this issue. I need a low cost method of carrying over a 1 between different chunks of data.

;---------------------------------------
; ADD Subroutine
;---------------------------------------
    .data

    bxx dw 0000h                        ;
    cxx dw 0000h                        ;

    .code
;---------------------------------------
addx:                                   ;
    mov bxx, bx                         ;save incoming register
    mov cxx, cx                         ;save incoming register
    mov bx, si                          ;move n to bl - acts as a cursor
loopAdd:                                ;loop point
    mov cx, [si+bx]                     ;move word at point si+bx into register cx
    ADC [di+bx], cx                     ;add with carry  
    sub bx, 0002h;                      ;decrement cursor by a full word
    cmp bx, 0000h                       ;bx == 0?
    jg loopAdd                          ;no? jump to loop point
end:                                    ;
    mov bx, bxx                         ;return register to original state
    mov cx, cxx                         ;return register to original state
    ret                                 ;return
;---------------------------------------

回答1:

You have to save the carry flag from the previous iteration.

Try this:

;---------------------------------------
; ADD Subroutine
;---------------------------------------
    .data

    bxx dw 0000h                        ;
    cxx dw 0000h                        ;

    .code
;---------------------------------------
addx:                                   ;
    mov bxx, bx                         ;save incoming register
    mov cxx, cx                         ;save incoming register
    mov bx, si                          ;move n to bl - acts as a cursor
    clc                                 ;clear carry flag
    pushf                               ;save flag register
loopAdd:                                ;loop point
    mov cx, [si+bx]                     ;move word at point si+bx into register cx
    popf                                ;restore saved flag register
    ADC [di+bx], cx                     ;add with carry
    pushf                               ;save flag register
    sub bx, 0002h;                      ;decrement cursor by a full word
    jg loopAdd                          ;if the result is positive, jump to loop point
end:                                    ;
    popf                                ;remove saved flag register from the stack
    mov bx, bxx                         ;return register to original state
    mov cx, cxx                         ;return register to original state
    ret                                 ;return
;---------------------------------------

Note that cmp bx, 0000h isn't needed because cmp is sub except for cmp only modify flags and don't store calculated value, so you can check the result of sub directly.



回答2:

If you want fast multi-precision addition, use 64bit code if at all possible. Doing 4x the width with every instruction directly gives a 4x speedup. On 386-compatible CPUs, you can use 32bit instructions and registers even in 16bit mode, which will give you a 2x speedup.


To take Ira's unroll suggestion a step further

  • reducing loop overhead by one lea
  • avoiding adc with a memory destination, which is slow on Intel.

    ... set up for the unrolled loop, with Ira's setup code
    ; di = dst pointer
    ; bx = src-dst, so bx+di = src
add_8words: ; carry bit has value to propagate
    ;sahf   ; another way to avoid a partial-flag stall while looping
    mov  ax, 0[bx+di]
    adc  ax, 0[di]
    mov  0[di], ax
    mov  ax, -1[bx+di]
    adc  ax, -1[di]
    mov  -1[di], ax
    mov  ax, -2[bx+di]
    adc  ax, -2[di]
    mov  -2[di], ax
    mov  ax, -3[bx+di]
    adc  ax, -3[di]
    mov  -3[di], ax
    mov  ax, -4[bx+di]
    adc  ax, -4[di]
    mov  -4[di], ax
    mov  ax, -5[bx+di]
    adc  ax, -5[di]
    mov  -5[di], ax
    mov  ax, -6[bx+di]
    adc  ax, -6[di]
    mov  -6[di], ax
    mov  ax, -7[bx+di]
    adc  ax, -7[di]
    mov  -7[di], ax

    lea   di, -8[di]
    ; lahf
    ; put the flag-setting insn next to the branch for potential macro-fusion
    dec   cx             ; causes a partial-flag stall, but only once per 8 adc
    jne   add_8word

This should run at nearly one adc per clock (minus loop overhead) on Intel Broadwell, and on AMD K8 through Bulldozer. (I forget if k8/k10 can do two loads per clock. That'll be the bottleneck if it can't). Using adc with a memory destination is not good on Intel, but fine on AMD, according to Agner Fog's tables. Intel Haswell and earlier will be limited by the 2c latency of adc. (Broadwell made adc and cmov single-uop instructions, making use of the 3-dependency uop support added in Haswell so FMA can be a single uop instruction).

A loop insn might be a win on older CPUs where a partial-reg stall is really bad, but other ways to avoid a partial-flag stall might be even better than the slow loop instruction.

Using the dest-source trick reduces the loop overhead of decrementing the pointer. 2-register addressing modes in loads don't need to micro-fuse, because pure mov loads are a single uop anyway. stores do need to micro-fuse, so you should prefer one-register addressing modes for stores. Haswell's extra store-address unit on port7 can only handle simple addressing modes, and 2-register addressing modes can't micro-fuse.

See also Problems with ADC/SBB and INC/DEC in tight loops on some CPUs for info about multiprecision adc loops and some experiments on Core2 vs. SnB CPUs for partial-flag stall performance.

Another way to loop here would be lea si, -1[si] / mov cx, si / jcxz. 16bit sucks, and you can't use [cx] in an effective address.



回答3:

OP says he wants a low cost solution to preserve the carry between iterations. @MikeCAT had a solution; @PeterCordes suggested some improvements.

There's one more really nice improvement you can get when doing multiprecision arithmetic, under the assumption that your multiprecison value is "big" (contains many word values), and that's unrolling the inner loop N times, avoiding loop counting/carry flag damage inside the unrolled section. (If your multiprecision arithmetic is not very multi, you don't need a lot of optimization).

I've revised @MikeCAT's answer here, with the assumption that the unrolling should be 8 iterations.

The code has 3 parts: determining how to handle a fragment of 8 words, handling the fragment in an unrolled way, and then handling multiple 8 word chunks efficiently in the main unrolled loop.

For OPs' example of 5 words, this routine never gets to the full unrolled loop. For large multiword values, it does, and I'd expect this routine is probably pretty fast.

[The following code is untested.]

;---------------------------------------
; ADD Subroutine
;   si = pointer to count word of 1st multiprecision value
;   di = pointer to count word of 2nd multiprecision value
;   assert: si->count == di ->count
;   preserves si, di; exits with carry from addition
;---------------------------------------
sizeofword equ 2
;---------------------------------------
add_multiple: ; destroys ax, si, di
    push cx                             ;save incoming register
    mov  cx, [si]
    lea  si, sizeofword[si+cx]          ; find least significant word  
    lea  di, sizeofword[di+cx]

; determine entry point into unrolled loop by taking counter modulo 8
    mov cx, si                          ;move n to bl - acts as a cursor
    shr cl, 1 
    jc  add_xx1
    je  done                            ; edge case: 0 words in value
add_xx0:
    shr cl, 1
    jc  add_x10
add_x00:
    shr cl, 1
    jnc add_000                         ; note carry flag is clear                         
;   clc                                
;   jmp add_100
    mov  ax, 0[si]                      
    add  0[di], ax                      ; do 1st add without carry
    lea  si, -1[si]
    lea  di, -1[di]
    jmp  add_011

add_x10:
    shr cl, 1
    jnc add_010
;   clc
;   jmp add_110
    mov  ax, 0[si]
    add  0[di], ax
    lea  si, -1[si]
    lea  di, -1[di]
    jmp  add_101

add_x01:
    shr cl, 1
    jnc add_001
;   clc
;   jmp add_101
    mov  ax, 0[si]
    adc  0[di], ax
    lea  si, -1[si]
    lea  di, -1[di]
    jmp  add_100

add_xx1:
    shr cl, 1
    jnc add_x01
add_x11:
    shr cl, 1
    jnc add_011
;   clc
;   jmp add_111

; the following code adds a fragment of an 8 word block
add_111: ; carry bit has value to propagate
    mov  ax, 0[si]         
;   adc  0[di], ax
    add  0[di], ax                             ; no carry in on 1st add
    lea  si, -1[si]
    lea  di, -1[di]
add_110:
    mov  ax, 0[si]
    adc  0[di], ax
    lea  si, -1[si]
    lea  di, -1[di]
add_101:
    mov  ax, 0[si]
    adc  0[di], ax
    lea  si, -1[si]
    lea  di, -1[di]
add_100:
    mov  ax, 0[si]
    adc  0[di], ax
    lea  si, -1[si]
    lea  di, -1[di]
add_011:
    mov  ax, 0[si]
    adc  0[di], ax
    lea  si, -1[si]
    lea  di, -1[di]
add_010:
    mov  ax, 0[si]
    adc  0[di], ax
    lea  si, -1[si]
    lea  di, -1[di]
add_001:
    mov  ax, 0[si]
    adc  0[di], ax
    lea  si, -1[si]
    lea  di, -1[di]
add_000:
    mov  ax, 0[si]
    adc  0[di], ax
    dec   cx                     ; does not disturb carry
    lea  si, -1[si]
    lea  di, -1[di]
    je    done

; unrolled loop here; very low overhead
add_8words: ; carry bit has value to propagate
    mov  ax, 0[si]
    adc  0[di], ax
    mov  ax, -1[si]
    adc  -1[di], ax
    mov  ax, -2[si]
    adc  -2[di], ax
    mov  ax, -3[si]
    adc  -3[di], ax
    mov  ax, -4[si]
    adc  -4[di], ax
    mov  ax, -5[si]
    adc  -5[di], ax
    mov  ax, -6[si]
    adc  -6[di], ax
    mov  ax, -7[si]
    adc  -7[di], ax
    dec   cx
    lea   si, -8[si]
    lea   di, -8[di]
    jne   add_8word
done: pop  cx
    ret

;---------------------------------------

The sequence

    mov  ax, 0[si]
    adc  0[di], ax
    lea  si, -1[si]
    lea  di, -1[di]

suggests perhaps using the single-word block move instructions as an alternative:

    std                          ; want to step backward
    ...
    lods
    adc  ax, 0[di]
    stos
    ...
    cld
    ret

with appropriate adjustments to the code, left to the reader.

Whether the loop I wrote or the LODS/STOS version is faster is something to measure carefully.