diff --git a/float16.cr b/float16.cr index b7fc90c..2cb9472 100644 --- a/float16.cr +++ b/float16.cr @@ -1,14 +1,24 @@ -f32_to_f16 1.5 -f32_to_f16 50.5 -f32_to_f16 0.0 -f32_to_f16 -1.0 -f32_to_f16 0.15625 -f32_to_f16 (1_f32 / 0_f32).as(Float32) -f32_to_f16 -(1_f32 / 0_f32).as(Float32) -f32_to_f16 100_000.0 -f32_to_f16 -100_000.0 -f32_to_f16 10.539187535151395835581398159855 -f32_to_f16 -10.539187535151395835581398159855 +print_result 0.0 +print_result -0.0 +print_result 1.5 +print_result -1.5 +print_result (1_f32 / 0_f32).as(Float32) +print_result -(1_f32 / 0_f32).as(Float32) +print_result (0_f32 / 0_f32).as(Float32) +print_result -(0_f32 / 0_f32).as(Float32) +print_result 65504.0 +print_result 10_000.0 +print_result -10_000.0 +# print_result 50.5 +# print_result 0.15625 +# print_result 10.539187 +# print_result -10.539187 + +def print_result(value : Float32) + final_value, status = f32_to_f16 value + puts "#{get_summary value}" + puts "#{get_summary final_value} => status: #{status}" +end def binary_8(v) sprintf "%08b", v & 0xFF @@ -66,8 +76,8 @@ def get_summary(value : Float32) str_value = "%15.6f" % value str_sign = binary_8(sign)[-1] - str_exp = "%15s" % binary_8(exp) - str_man = "%30s" % binary_mantisse_f32(man) + str_exp = "%10s" % binary_8(exp) + str_man = "%28s" % binary_mantisse_f32(man) "32-bit: #{str_value} => #{str_sign} #{str_exp} #{str_man}" end @@ -81,19 +91,28 @@ def get_summary(value : UInt16) # 5-bit value exp = (buffer[0].to_u32 & 0x7F) >> 2 # 23-bit value - man = ((buffer[0].to_u32 & 0x03) << 6) | (buffer[1].to_u32 << 8) + man = ((buffer[0].to_u32 & 0x03) << 8) | buffer[1].to_u32 str_value = "%15d" % value str_sign = binary_8(sign)[-1] # 1-bit value - str_exp = "%15s" % binary_8(exp)[3..7] # 5-bit value - str_man = "%30s" % binary_mantisse_f16(man) # 10-bit value + str_exp = "%10s" % binary_8(exp)[3..7] # 5-bit value + str_man = "%28s" % binary_mantisse_f16(man) # 10-bit value "16-bit: #{str_value} => #{str_sign} #{str_exp} #{str_man}" end -def f32_to_f16(value : Float32) +enum ConversionInfo + OK + NotANumber + Overflow + Underflow + FullUnderflow + Infinite + NegativeInfinite +end + +def f32_to_f16(value : Float32) - puts get_summary value buffer = get_buffer value # 0 or 1 @@ -101,15 +120,22 @@ def f32_to_f16(value : Float32) # 8-bit value exp = ((buffer[0].to_u32 & 0x7F) << 1) | (buffer[1].to_u32 >> 7) # 23-bit value - man = (buffer[1].to_u32 << 16) | (buffer[2].to_u32 << 8) | buffer[3].to_u32 + man = ((buffer[1].to_u32 & 0x7F) << 16) | (buffer[2].to_u32 << 8) | buffer[3].to_u32 # Check for all exponent bits being set, which is Infinity or NaN if exp == 0xFF # Set mantissa MSB for NaN (and also keep shifted mantissa bits) nan_bit = man == 0 ? 0 : 0x0200 - float16_value = (((sign << 15) | 0x7C00 | nan_bit | man) & 0xFFFF).to_u16 - puts "#{get_summary float16_value} => inf or nan" - return float16_value + final_value = (((sign << 15) | 0x7C00 | nan_bit | man) & 0xFFFF).to_u16 + # puts "#{get_summary final_value} => inf or nan" + conversion_info = if nan_bit != 0 + ConversionInfo::NotANumber + elsif sign == 0 + ConversionInfo::Infinite + else + ConversionInfo::NegativeInfinite + end + return final_value, conversion_info end # The number is normalized, start assembling half precision version @@ -123,8 +149,8 @@ def f32_to_f16(value : Float32) # Check for exponent overflow, return +infinity if half_exp >= 0x1F final_value = (half_sign | 0x7C00).to_u16 - puts "#{get_summary final_value} => overflow, return ± inf" - return final_value + # puts "#{get_summary final_value} => overflow, return ± inf" + return final_value, ConversionInfo::Overflow end # Check for underflow @@ -132,8 +158,8 @@ def f32_to_f16(value : Float32) # Check mantissa for what we can do if 14 - half_exp > 24 # No rounding possibility, so this is a full underflow, return signed zero - puts "#{get_summary half_sign.to_u16} => full underflow" - return half_sign.to_u16 + # puts "#{get_summary half_sign.to_u16} => full underflow" + return half_sign.to_u16, ConversionInfo::FullUnderflow end # Don't forget about hidden leading mantissa bit when assembling mantissa @@ -153,7 +179,7 @@ def f32_to_f16(value : Float32) puts "#{get_summary final_value} => underflow" - return final_value + return final_value, ConversionInfo::Underflow end # Rebias the exponent @@ -164,12 +190,15 @@ def f32_to_f16(value : Float32) # Check for rounding (see comment above functions) round_bit = 0x0000_1000u32 final_value = if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 + puts "round it" # Round it ((half_sign | half_exp | half_man) + 1).to_u16 else + v = (half_sign | half_exp | half_man) + # puts "NOT round it #{binary_32(v)}" + # puts "NOT round it #{binary_16(v.to_u16)}" (half_sign | half_exp | half_man).to_u16 end - puts "#{get_summary final_value}" - final_value + return final_value, ConversionInfo::OK end