# Demo: convert a few Float32 values to IEEE 754 half precision (stored in a
# UInt16) and print the bit layout of both representations.
print_result 0.0
print_result -0.0
print_result 1.5
print_result -1.5
print_result (1_f32 / 0_f32).as(Float32)
print_result -(1_f32 / 0_f32).as(Float32)
print_result (0_f32 / 0_f32).as(Float32)
print_result -(0_f32 / 0_f32).as(Float32)
print_result 65504.0
print_result 10_000.0
print_result -10_000.0
print_result 153.0
print_result -3992.0
# print_result 0.15625
# print_result 10.539187
# print_result -10.539187

# Converts *value* to half precision and prints one summary line for the
# original Float32 and one for the resulting 16-bit pattern, followed by the
# conversion status.
def print_result(value : Float32)
  final_value, status = f32_to_f16 value
  puts "#{get_summary value}"
  puts "#{get_summary final_value} => status: #{status}"
end

# Formats the low 8 bits of *v* as a zero-padded binary string.
def binary_8(v)
  sprintf "%08b", v & 0xFF
end

# Formats the low 16 bits of *v* as two space-separated binary octets.
def binary_16(v)
  sprintf "%08b %08b", (v >> 8) & 0xFF, v & 0xFF
end

# Formats the low 24 bits of *v* as three space-separated binary octets.
def binary_24(v)
  sprintf "%08b %08b %08b", (v >> 16) & 0xFF, (v >> 8) & 0xFF, v & 0xFF
end

# Formats the low 32 bits of *v* as four space-separated binary octets.
def binary_32(v)
  sprintf "%08b %08b %08b %08b", (v >> 24) & 0xFF, (v >> 16) & 0xFF, (v >> 8) & 0xFF, v & 0xFF
end

# Binary text of a Float32 mantissa.
def binary_mantisse_f32(v)
  # The mantissa is only 23 bits wide: drop the first character of the
  # 24-bit rendering.
  binary_24(v)[1..]
end

# Binary text of a half-precision mantissa.
def binary_mantisse_f16(v)
  # The mantissa is only 10 bits wide: drop the first six characters of the
  # 16-bit rendering.
  binary_16(v)[6..]
end

# Splits a 16-bit value into its two bytes, most significant byte first.
def get_buffer(value : UInt16)
  [
    ((value >> 8) & 0xFF).to_u8,
    (value & 0xFF).to_u8,
  ]
end

# Describes a Float32: decimal value, then sign, exponent and mantissa bits.
def get_summary(value : Float32)
  buffer = get_buffer value

  # 0 or 1
  sign = (buffer[0].to_u32 >> 7)
  # 8-bit value
  exp = ((buffer[0].to_u32 & 0x7F) << 1) | (buffer[1].to_u32 >> 7)
  # 24-bit value; its top bit belongs to the exponent and is stripped by
  # binary_mantisse_f32, leaving the 23 mantissa bits.
  man = (buffer[1].to_u32 << 16) | (buffer[2].to_u32 << 8) | buffer[3].to_u32

  str_value = "%15.6f" % value
  str_sign = binary_8(sign)[-1]
  str_exp = "%10s" % binary_8(exp)
  str_man = "%28s" % binary_mantisse_f32(man)

  "32-bit: #{str_value} => #{str_sign} #{str_exp} #{str_man}"
end

# Describes a half-precision float stored in a UInt16: raw integer value,
# then sign, exponent and mantissa bits.
def get_summary(value : UInt16)
  buffer = get_buffer value

  # 1-bit value
  sign = (buffer[0].to_u32 >> 7)
  # 5-bit value
  exp = (buffer[0].to_u32 & 0x7F) >> 2
  # 10-bit value
  man = ((buffer[0].to_u32 & 0x03) << 8) | buffer[1].to_u32

  str_value = "%15d" % value
  str_sign = binary_8(sign)[-1]                # 1-bit value
  str_exp = "%10s" % binary_8(exp)[3..7]       # 5-bit value
  str_man = "%28s" % binary_mantisse_f16(man)  # 10-bit value

  "16-bit: #{str_value} => #{str_sign} #{str_exp} #{str_man}"
end

# Returns the four bytes of *value*'s IEEE 754 encoding, most significant
# byte first.
def get_buffer(value : Float32)
  # Encode directly into a byte slice. Reading the bytes back through a
  # line-oriented IO#gets would stop early whenever one of them is 0x0A.
  bytes = Bytes.new(4)
  IO::ByteFormat::NetworkEndian.encode(value, bytes)
  bytes
end

# Outcome reported by f32_to_f16 alongside the converted value.
enum ConversionInfo
  OK
  NotANumber
  Overflow
  Underflow
  FullUnderflow
  Infinite
  NegativeInfinite
end

# Converts a Float32 to IEEE 754 half precision.
#
# Returns a tuple `{bits, info}` where *bits* is the half-precision bit
# pattern as a UInt16 and *info* is a ConversionInfo:
#
# * `OK`: finite result with a normal exponent, or an exact signed zero
# * `Underflow`: the result was assembled as a half-precision subnormal
# * `FullUnderflow`: magnitude too small to round to anything but signed zero
# * `Overflow`: finite input whose result is a signed infinity
# * `Infinite` / `NegativeInfinite`: the input was already infinite
# * `NotANumber`: the input was NaN
#
# Rounding is round-half-to-even: the result is incremented when the first
# dropped bit is set and either a lower dropped bit or the lowest kept bit is
# set too.
def f32_to_f16(value : Float32)
  buffer = get_buffer value

  # 0 or 1
  sign = (buffer[0].to_u32 >> 7)
  # 8-bit value
  exp = ((buffer[0].to_u32 & 0x7F) << 1) | (buffer[1].to_u32 >> 7)
  # 23-bit value
  man = ((buffer[1].to_u32 & 0x7F) << 16) | (buffer[2].to_u32 << 8) | buffer[3].to_u32

  # Check for all exponent bits being set, which is Infinity or NaN
  if exp == 0xFF
    # Set mantissa MSB for NaN, and keep the top 10 mantissa bits. The
    # mantissa must be shifted down first, otherwise its low bits would
    # overwrite the sign and exponent fields.
    nan_bit = man == 0 ? 0 : 0x0200
    final_value = ((sign << 15) | 0x7C00 | nan_bit | (man >> 13)).to_u16

    conversion_info = if nan_bit != 0
                        ConversionInfo::NotANumber
                      elsif sign == 0
                        ConversionInfo::Infinite
                      else
                        ConversionInfo::NegativeInfinite
                      end

    return final_value, conversion_info
  end

  half_sign = sign << 15

  # Signed zero converts exactly; it is not an underflow.
  if exp == 0 && man == 0
    return half_sign.to_u16, ConversionInfo::OK
  end

  # Unbias the exponent, then bias for half precision
  half_exp = (exp.to_i64 - 127 + 15).to_i16

  # Check for exponent overflow, return signed infinity
  if half_exp >= 0x1F
    final_value = (half_sign | 0x7C00).to_u16
    return final_value, ConversionInfo::Overflow
  end

  # Check for underflow
  if half_exp <= 0
    # Check mantissa for what we can do
    if 14 - half_exp > 24
      # No rounding possibility, so this is a full underflow, return signed zero
      return half_sign.to_u16, ConversionInfo::FullUnderflow
    end

    # Don't forget about hidden leading mantissa bit when assembling mantissa
    man = man | 0x0080_0000
    half_man = man >> (14 - half_exp)

    # Round half to even: round_bit is the first dropped bit; the mask
    # (3 * round_bit - 1) covers the lowest kept bit and all lower dropped bits.
    round_bit = 1 << (13 - half_exp)
    if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0
      half_man += 1
    end

    # No exponent for subnormals
    final_value = (half_sign | half_man).to_u16
    return final_value, ConversionInfo::Underflow
  end

  # Rebias the exponent
  half_exp = (half_exp) << 10
  half_man = (man >> 13) & 0x03FF
  packed = half_sign | half_exp | half_man

  # Round half to even (same rule as the subnormal path). A carry out of the
  # mantissa propagates into the exponent field, which is the intended result.
  round_bit = 0x0000_1000u32
  if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0
    packed += 1
  end

  # Rounding may have carried into an all-ones exponent, i.e. infinity.
  conversion_info = if (packed & 0x7C00) == 0x7C00
                      ConversionInfo::Overflow
                    else
                      ConversionInfo::OK
                    end

  return packed.to_u16, conversion_info
end