16-bit holidays!
What could be better to spice up your winter holidays than some 16 bit x86 assembly? As an exercise for removing the rust from my assembly-fu, I wrote a small Base64 decoder purely in 8086 assembly code:
- it decodes a well-encoded message 4 bytes at a time and prints 3 characters to the screen;
- handles padding `=`s;
- it does not handle faulty encodings;
- should use opcodes from the original 8086 only (unless I got it wrong);
- real mode code, meant to be embedded into a Master Boot Record;
- video output is done using the BIOS via
int 10h.
How to assemble
You’ll need the Flat Assembler (fasm) to assemble the program.
With that installed, just run:
fasm base64.s
to get base64.bin as output. The file is already formatted to look
like a valid MBR: it’s 512 bytes long and ends with 55 AA, so it can
be run directly using an emulator like bochs or qemu.
How to run
Once compiled, the simplest way to try it out is to have qemu run it
as if it was a floppy image. It is obviously too small to be a full
floppy, but qemu does not complain and we do not try to read additional
sectors:
qemu-system-x86_64 -fda base64.bin
What does it print? Well, why don’t you try it for yourself? But if you are in a hurry, there is a picture of it at the end of the page.
Of course, you can change the message and rebuild.
The code
; A base64 decoder, written in 8086 assembly language.
; Assemble using the Flat Assembler (https://flatassembler.net/)
;
; Output format and addressing:
;   - "format binary" emits a raw image with no headers, so the first byte of
;     the file is the first instruction executed.
;   - "org 0" makes every label an offset from the start of the image. The
;     code at "start" therefore loads 07C0h into CS, DS and ES so that these
;     offsets resolve to the place where the sector actually sits in memory.
format binary
org 0
; Entry point
; The BIOS loads this sector at linear address 0x7C00, but the CS:IP pair it
; jumps with is not fixed: 07C0:0000, 0000:7C00 or any equivalent form.
; Whatever the pair, the program occupies [0x7C00, 0x7DFF].
start:
        ; Stack: SS:SP = 07E0:0200, i.e. the 512 bytes [0x7E00, 0x7FFF] that
        ; follow the sector. Interrupts are masked while SS:SP is switched
        ; and enabled again right after.
        cli
        mov     ax, 07E0h
        mov     ss, ax
        mov     sp, 0200h
        sti
        cld                             ; string instructions count upwards

        ; Data segments: offset 0 = first byte of the sector, matching "org 0"
        mov     ax, 07C0h
        mov     es, ax
        mov     ds, ax

        ; Far-return to 07C0:normalized so that CS agrees with "org 0" as
        ; well, whatever CS:IP the BIOS used to get here.
        push    ax                      ; new CS
        mov     ax, normalized
        push    ax                      ; new IP
        retf
normalized:
        mov     si, msg                 ; SI = read cursor into the message
; "chunk" processes 4 bytes of base64 data, yielding 3 characters
chunk:
        ; Reads one group of up to 4 base64 symbols and stores their 6-bit
        ; values in c1..c4, then falls through into "printer".
        ; In:    SI = next unread byte of the NUL-terminated message
        ;        DF = 0, DS = ES = segment of msg, the table and c1..c4
        ; Out:   c1.. = values of the symbols read; c3/c4 keep 0FFh when the
        ;        group is cut short by '=' or by the end of the message
        ; Clobb: AX, BX, CX, DI, flags; SI is advanced

        ; End of string?
        cmp     byte [si], 0
        je      endloop
        xor     bx, bx                  ; BH = 0, so BX = symbol once BL is set
        mov     cx, 4                   ; symbols still missing in this group
        mov     di, c1                  ; stosb writes c1, c2, c3, c4 in turn
        ; Initially mark c3 and c4 as padded
        mov     word [c3], 0FFFFh
b64Char:
        lodsb
        ; A '=' means the group is padded: there are no more bits to read.
        cmp     al, '='
        je      groupEnd
        ; The NUL terminator inside a group (unpadded input) ends the group
        ; too. Without this check the terminator would be used as a table
        ; index and SI would keep running past the end of the message.
        test    al, al
        jnz     b64Store
groupEnd:
        ; CX = 4 - symbols stored. With fewer than 2 symbols there is not
        ; even one complete output byte, so stop instead of printing a
        ; character built from a stale c2.
        cmp     cx, 3
        jae     endloop
        jmp     printer
b64Store:
        mov     bl, al
        mov     al, [bx + base64]       ; ASCII symbol -> 6-bit value
        stosb
        loop    b64Char
; When execution arrives here, c1..c4 hold the 6-bit values of the base64
; symbols of the current group (bits 5..0 of each byte). For c3 and c4 a set
; bit 7 means "no symbol": the group was padded.
;
; Bit layout (O = clear bit, I = set bit, 1/2/3 = bit of decoded char 1/2/3):
;   c1 = OO111111
;   c2 = OO112222
;   c3 = OO222233 | IIIIIIII (padding)
;   c4 = OO333333 | IIIIIIII (padding)
;
; 8086 shifts by more than one position take their count from CL, hence the
; "mov cl, n" before every shift.
printer:
        ; Character 1 = c1 bits 5..0 followed by c2 bits 5..4
        mov     al, [c1]
        mov     cl, 2
        shl     al, cl                  ; AL = 111111OO
        mov     ah, [c2]
        mov     cl, 4
        shr     ah, cl
        and     ah, 03h                 ; AH = OOOOOO11
        or      al, ah
        call    print_char

        ; Character 2 = c2 bits 3..0 followed by c3 bits 5..2,
        ; but only if there is a c3
        mov     ah, [c3]
        test    ah, 80h
        jnz     endloop
        mov     cl, 2
        shr     ah, cl                  ; AH = c3 >> 2
        mov     al, [c2]
        mov     cl, 4
        shl     al, cl                  ; AL = 2222OOOO
        or      al, ah
        call    print_char

        ; Character 3 = c3 bits 1..0 followed by c4 bits 5..0,
        ; but only if there is a c4
        mov     al, [c4]
        test    al, 80h
        jnz     endloop
        and     al, 3Fh                 ; AL = OO333333
        mov     ah, [c3]
        mov     cl, 6
        shl     ah, cl                  ; AH = 33OOOOOO
        or      al, ah
        call    print_char

        jmp     chunk                   ; full group printed: decode the next

; Final state: sleep until an interrupt arrives, then sleep again.
endloop:
        hlt
        jmp     endloop
; print_char: write one character through BIOS video service int 10h,
; function 0Eh.
; In:    AL = character
; Out:   nothing
; Clobb: AH (= 0Eh), BH (= 0); CX is saved and restored
print_char:
        push    cx
        mov     ah, 0Eh                 ; function selector for int 10h
        sub     cx, cx                  ; CX = 0 for the duration of the call
        mov     bh, ch                  ; BH = 0
        int     10h
        pop     cx
        retn
; Message to decode: base64 text followed by a NUL terminator.
; The decoder stops at the terminator or at the first padded group.
msg     db      'SGFwcHkgaG9saWRheXMhISE='
        db      0
; Table converting ASCII characters to their base64 group of 6 bits.
; Only the span from the lowest used character '+' (43) up to 'z' (122) is
; stored; the codes in between that are not base64 symbols are reserved
; bytes with no value assigned.
chartab db      62                      ; '+'
        rb      3                       ; ',' '-' '.'
        db      63                      ; '/'
repeat 10                               ; '0'..'9' -> 52..61
        db      52 + % - 1
end repeat
        rb      7                       ; ':' .. '@'
repeat 26                               ; 'A'..'Z' -> 0..25
        db      % - 1
end repeat
        rb      6                       ; '[' .. '`'
repeat 26                               ; 'a'..'z' -> 26..51
        db      26 + % - 1
end repeat

; "base64" names the address where the entry for ASCII code 0 would be if the
; table started there, so [base64 + ascii] lands on the entry for that
; character. No bytes are emitted for it: it is only an adjusted offset.
label base64 byte at chartab - '+'
; Work area: the 6-bit values of the base64 group under examination.
; Kept word-aligned because the decoder accesses neighbouring bytes as
; 16-bit words in some places (c1:c2 and c3:c4).
align 2
c1      rb      1
c2      rb      1
c3      rb      1
c4      rb      1
; Fill the image up to offset 510 and finish with the two MBR signature
; bytes 55h, AAh, giving a 512-byte sector that the BIOS accepts as bootable.
times 510 - $ db 0
dw 0AA55h                               ; stored little-endian: 55h, then AAh
Sample output
