diff --git a/float16.cr b/float16.cr
index b363372..b7fc90c 100644
--- a/float16.cr
+++ b/float16.cr
@@ -4,57 +4,42 @@ f32_to_f16 0.0
 f32_to_f16 -1.0
 f32_to_f16 0.15625
 f32_to_f16 (1_f32 / 0_f32).as(Float32)
+f32_to_f16 -(1_f32 / 0_f32).as(Float32)
+f32_to_f16 100_000.0
+f32_to_f16 -100_000.0
+f32_to_f16 10.539187535151395835581398159855
+f32_to_f16 -10.539187535151395835581398159855
 
 def binary_8(v)
   sprintf "%08b", v & 0xFF
 end
 
 def binary_16(v)
-  sprintf "%08b %08b",
-    (v >> 8) & 0xFF,
-    v & 0xFF
+  sprintf "%08b %08b", (v >> 8) & 0xFF, v & 0xFF
 end
 
 def binary_24(v)
-  sprintf "%08b %08b %08b",
-    (v >> 16) & 0xFF,
-    (v >> 8) & 0xFF,
-    v & 0xFF
-end
-
-def binary_mantisse(v)
-  binary_24(v)[1..]
+  sprintf "%08b %08b %08b", (v >> 16) & 0xFF, (v >> 8) & 0xFF, v & 0xFF
 end
 
 def binary_32(v)
-  sprintf "%08b %08b %08b %08b",
-    (v >> 24) & 0xFF,
-    (v >> 16) & 0xFF,
-    (v >> 8) & 0xFF,
-    v & 0xFF
+  sprintf "%08b %08b %08b %08b", (v >> 24) & 0xFF, (v >> 16) & 0xFF, (v >> 8) & 0xFF, v & 0xFF
 end
 
-def print_summary(value : Float32)
-
-  # 0 or 1
-  sign = (buffer[0].to_u32 >> 7)
-  # 8-bit value
-  exp = ((buffer[0].to_u32 & 0x7F) << 1) | (buffer[1].to_u32 >> 7)
-  # 23-bit value
-  man = (buffer[1].to_u32 << 16) | (buffer[2].to_u32 << 8) | buffer[3].to_u32
-
-  str_value = "%10.6f" % value
-  str_sign = binary_8(sign)[-1]
-  str_exp = binary_8(exp)
-  str_man = binary_mantisse(man)
-
-  puts "#{str_value} => #{str_sign} #{str_exp} #{str_man}"
+def binary_mantisse_f32(v)
+  binary_24(v)[1..] # the mantissa is only 23 bits: drop the leading (24th) bit
 end
 
-def print_summary(value : Float32)
+def binary_mantisse_f16(v)
+  binary_16(v)[6..] # the mantissa is only 10 bits: drop the 6 leading bits
 end
 
-def f32_to_f16(value : Float32)
+
+def get_buffer(value : UInt16)
+  [ ((value >> 8) & 0xFF).to_u8, (value & 0xFF).to_u8 ]
+end
+
+def get_buffer(value : Float32)
   # TODO: is there a simpler way to perform binary operations over a float?
   # Extract IEEE754 components
   io = IO::Memory.new
@@ -66,7 +51,11 @@ def f32_to_f16(value : Float32)
     raise "cannot perform f32 to f16 on value #{value}"
   end
 
-  buffer = v.to_slice
+  v.to_slice
+end
+
+def get_summary(value : Float32)
+  buffer = get_buffer value
 
   # 0 or 1
   sign = (buffer[0].to_u32 >> 7)
@@ -75,61 +64,114 @@ def f32_to_f16(value : Float32)
   # 23-bit value
   man = (buffer[1].to_u32 << 16) | (buffer[2].to_u32 << 8) | buffer[3].to_u32
 
-  print_summary value, buffer
+  str_value = "%15.6f" % value
+  str_sign = binary_8(sign)[-1]
+  str_exp = "%15s" % binary_8(exp)
+  str_man = "%30s" % binary_mantisse_f32(man)
+
+  "32-bit: #{str_value} => #{str_sign} #{str_exp} #{str_man}"
+end
+
+# Formats the sign, exponent and mantissa bits of a Float16 stored in a UInt16
+def get_summary(value : UInt16)
+  buffer = get_buffer value
+
+  # 1-bit value
+  sign = (buffer[0].to_u32 >> 7)
+  # 5-bit value
+  exp = (buffer[0].to_u32 & 0x7F) >> 2
+  # 10-bit value: low 2 bits of the first byte, then the whole second byte
+  man = ((buffer[0].to_u32 & 0x03) << 8) | buffer[1].to_u32
+
+  str_value = "%15d" % value
+  str_sign = binary_8(sign)[-1] # 1-bit value
+  str_exp = "%15s" % binary_8(exp)[3..7] # 5-bit value
+  str_man = "%30s" % binary_mantisse_f16(man) # 10-bit value
+
+  "16-bit: #{str_value} => #{str_sign} #{str_exp} #{str_man}"
+end
+
+# Converts a Float32 to its half-precision bit pattern, returned as a UInt16.
+# Prints a summary of the input and of the produced value along the way.
+def f32_to_f16(value : Float32)
+
+  puts get_summary value
+  buffer = get_buffer value
+
+  # 0 or 1
+  sign = (buffer[0].to_u32 >> 7)
+  # 8-bit value
+  exp = ((buffer[0].to_u32 & 0x7F) << 1) | (buffer[1].to_u32 >> 7)
+  # 23-bit value (bit 7 of buffer[1] is the exponent's low bit, mask it off)
+  man = ((buffer[1].to_u32 & 0x7F) << 16) | (buffer[2].to_u32 << 8) | buffer[3].to_u32
 
   # Check for all exponent bits being set, which is Infinity or NaN
   if exp == 0xFF
-    puts "exp == 0xFF"
     # Set mantissa MSB for NaN (and also keep shifted mantissa bits)
     nan_bit = man == 0 ? 0 : 0x0200
-    pp! binary_24(nan_bit)
-    float16_value = ((sign << 15) | 0x7C00 | nan_bit | man) & 0xFFFF
-    f16_value = sprintf "%08b %08b", float16_value >> 8, float16_value & 0xFF
-    pp! f16_value
+    float16_value = (((sign << 15) | 0x7C00 | nan_bit | (man >> 13)) & 0xFFFF).to_u16
+    puts "#{get_summary float16_value} => inf or nan"
     return float16_value
   end
 
-  return 0
+  # The number is normalized, start assembling half precision version
+  half_sign = sign << 15
 
-# // The number is normalized, start assembling half precision version
-# let half_sign = sign >> 16;
-# // Unbias the exponent, then bias for half precision
-# let unbiased_exp = ((exp >> 23) as i32) - 127;
-# let half_exp = unbiased_exp + 15;
-#
-# // Check for exponent overflow, return +infinity
-# if half_exp >= 0x1F {
-#     return (half_sign | 0x7C00u32) as u16;
-# }
-#
-# // Check for underflow
-# if half_exp <= 0 {
-#     // Check mantissa for what we can do
-#     if 14 - half_exp > 24 {
-#         // No rounding possibility, so this is a full underflow, return signed zero
-#         return half_sign as u16;
-#     }
-#     // Don't forget about hidden leading mantissa bit when assembling mantissa
-#     let man = man | 0x0080_0000u32;
-#     let mut half_man = man >> (14 - half_exp);
-#     // Check for rounding (see comment above functions)
-#     let round_bit = 1 << (13 - half_exp);
-#     if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
-#         half_man += 1;
-#     }
-#     // No exponent for subnormals
-#     return (half_sign | half_man) as u16;
-# }
-#
-# // Rebias the exponent
-# let half_exp = (half_exp as u32) << 10;
-# let half_man = man >> 13;
-# // Check for rounding (see comment above functions)
-# let round_bit = 0x0000_1000u32;
-# if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
-#     // Round it
-#     ((half_sign | half_exp | half_man) + 1) as u16
-# } else {
-#     (half_sign | half_exp | half_man) as u16
-# }
+  # Unbias the exponent, then bias for half precision
+  half_exp = (exp.to_i64 - 127 + 15).to_i16
+  # puts "     exp: #{typeof(exp)} -> #{exp}"
+  # puts "half_exp: #{typeof(half_exp)} -> #{half_exp}"
+
+  # Check for exponent overflow, return +infinity
+  if half_exp >= 0x1F
+    final_value = (half_sign | 0x7C00).to_u16
+    puts "#{get_summary final_value} => overflow, return ± inf"
+    return final_value
+  end
+
+  # Check for underflow
+  if half_exp <= 0
+    # Check mantissa for what we can do
+    if 14 - half_exp > 24
+      # No rounding possibility, so this is a full underflow, return signed zero
+      puts "#{get_summary half_sign.to_u16} => full underflow"
+      return half_sign.to_u16
+    end
+
+    # Don't forget about hidden leading mantissa bit when assembling mantissa
+    man = man | 0x0080_0000
+    half_man = man >> (14 - half_exp)
+
+    pp! binary_mantisse_f32(man), binary_mantisse_f16(half_man)
+
+    # Check for rounding (see comment above functions)
+    round_bit = 1 << (13 - half_exp)
+    if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0
+      half_man += 1
+    end
+
+    # No exponent for subnormals
+    final_value = (half_sign | half_man).to_u16
+
+    puts "#{get_summary final_value} => underflow"
+
+    return final_value
+  end
+
+  # Rebias the exponent
+  half_exp = (half_exp) << 10
+  half_man = (man >> 13) & 0x03FF
+  # puts "     man: #{binary_mantisse_f32(man)}"
+  # puts "half_man: #{binary_mantisse_f16(half_man)}"
+  # Check for rounding (see comment above functions)
+  round_bit = 0x0000_1000u32
+  final_value = if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0
+    # Round it
+    ((half_sign | half_exp | half_man) + 1).to_u16
+  else
+    (half_sign | half_exp | half_man).to_u16
+  end
+
+  puts "#{get_summary final_value}"
+  final_value
 end