Seems OK-ish
parent
16a839666a
commit
cd44c4a724
204
float16.cr
204
float16.cr
|
@ -4,57 +4,42 @@ f32_to_f16 0.0
|
||||||
f32_to_f16 -1.0
|
f32_to_f16 -1.0
|
||||||
f32_to_f16 0.15625
|
f32_to_f16 0.15625
|
||||||
f32_to_f16 (1_f32 / 0_f32).as(Float32)
|
f32_to_f16 (1_f32 / 0_f32).as(Float32)
|
||||||
|
f32_to_f16 -(1_f32 / 0_f32).as(Float32)
|
||||||
|
f32_to_f16 100_000.0
|
||||||
|
f32_to_f16 -100_000.0
|
||||||
|
f32_to_f16 10.539187535151395835581398159855
|
||||||
|
f32_to_f16 -10.539187535151395835581398159855
|
||||||
|
|
||||||
def binary_8(v)
|
def binary_8(v)
|
||||||
sprintf "%08b", v & 0xFF
|
sprintf "%08b", v & 0xFF
|
||||||
end
|
end
|
||||||
|
|
||||||
def binary_16(v)
|
def binary_16(v)
|
||||||
sprintf "%08b %08b",
|
sprintf "%08b %08b", (v >> 8) & 0xFF, v & 0xFF
|
||||||
(v >> 8) & 0xFF,
|
|
||||||
v & 0xFF
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def binary_24(v)
|
def binary_24(v)
|
||||||
sprintf "%08b %08b %08b",
|
sprintf "%08b %08b %08b", (v >> 16) & 0xFF, (v >> 8) & 0xFF, v & 0xFF
|
||||||
(v >> 16) & 0xFF,
|
|
||||||
(v >> 8) & 0xFF,
|
|
||||||
v & 0xFF
|
|
||||||
end
|
|
||||||
|
|
||||||
def binary_mantisse(v)
|
|
||||||
binary_24(v)[1..]
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def binary_32(v)
|
def binary_32(v)
|
||||||
sprintf "%08b %08b %08b %08b",
|
sprintf "%08b %08b %08b %08b", (v >> 24) & 0xFF, (v >> 16) & 0xFF, (v >> 8) & 0xFF, v & 0xFF
|
||||||
(v >> 24) & 0xFF,
|
|
||||||
(v >> 16) & 0xFF,
|
|
||||||
(v >> 8) & 0xFF,
|
|
||||||
v & 0xFF
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def print_summary(value : Float32)
|
def binary_mantisse_f32(v)
|
||||||
|
binary_24(v)[1..] # mantisse only is 23-bit, remove the first represented bit
|
||||||
# 0 or 1
|
|
||||||
sign = (buffer[0].to_u32 >> 7)
|
|
||||||
# 8-bit value
|
|
||||||
exp = ((buffer[0].to_u32 & 0x7F) << 1) | (buffer[1].to_u32 >> 7)
|
|
||||||
# 23-bit value
|
|
||||||
man = (buffer[1].to_u32 << 16) | (buffer[2].to_u32 << 8) | buffer[3].to_u32
|
|
||||||
|
|
||||||
str_value = "%10.6f" % value
|
|
||||||
str_sign = binary_8(sign)[-1]
|
|
||||||
str_exp = binary_8(exp)
|
|
||||||
str_man = binary_mantisse(man)
|
|
||||||
|
|
||||||
puts "#{str_value} => #{str_sign} #{str_exp} #{str_man}"
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def print_summary(value : Float32)
|
def binary_mantisse_f16(v)
|
||||||
|
binary_16(v)[6..] # mantisse only is 10-bit, remove the first represented bits
|
||||||
end
|
end
|
||||||
|
|
||||||
def f32_to_f16(value : Float32)
|
|
||||||
|
def get_buffer(value : UInt16)
|
||||||
|
[ ((value >> 8) & 0xFF).to_u8, (value & 0xFF).to_u8 ]
|
||||||
|
end
|
||||||
|
|
||||||
|
def get_buffer(value : Float32)
|
||||||
# TODO: is there a simpler way to perform binary operations over a float?
|
# TODO: is there a simpler way to perform binary operations over a float?
|
||||||
# Extract IEEE754 components
|
# Extract IEEE754 components
|
||||||
io = IO::Memory.new
|
io = IO::Memory.new
|
||||||
|
@ -66,7 +51,11 @@ def f32_to_f16(value : Float32)
|
||||||
raise "cannot perform f32 to f16 on value #{value}"
|
raise "cannot perform f32 to f16 on value #{value}"
|
||||||
end
|
end
|
||||||
|
|
||||||
buffer = v.to_slice
|
v.to_slice
|
||||||
|
end
|
||||||
|
|
||||||
|
def get_summary(value : Float32)
|
||||||
|
buffer = get_buffer value
|
||||||
|
|
||||||
# 0 or 1
|
# 0 or 1
|
||||||
sign = (buffer[0].to_u32 >> 7)
|
sign = (buffer[0].to_u32 >> 7)
|
||||||
|
@ -75,61 +64,112 @@ def f32_to_f16(value : Float32)
|
||||||
# 23-bit value
|
# 23-bit value
|
||||||
man = (buffer[1].to_u32 << 16) | (buffer[2].to_u32 << 8) | buffer[3].to_u32
|
man = (buffer[1].to_u32 << 16) | (buffer[2].to_u32 << 8) | buffer[3].to_u32
|
||||||
|
|
||||||
print_summary value, buffer
|
str_value = "%15.6f" % value
|
||||||
|
str_sign = binary_8(sign)[-1]
|
||||||
|
str_exp = "%15s" % binary_8(exp)
|
||||||
|
str_man = "%30s" % binary_mantisse_f32(man)
|
||||||
|
|
||||||
|
"32-bit: #{str_value} => #{str_sign} #{str_exp} #{str_man}"
|
||||||
|
end
|
||||||
|
|
||||||
|
# Float16 in a UInt16 value
|
||||||
|
def get_summary(value : UInt16)
|
||||||
|
buffer = get_buffer value
|
||||||
|
|
||||||
|
# 1-bit value
|
||||||
|
sign = (buffer[0].to_u32 >> 7)
|
||||||
|
# 5-bit value
|
||||||
|
exp = (buffer[0].to_u32 & 0x7F) >> 2
|
||||||
|
# 23-bit value
|
||||||
|
man = ((buffer[0].to_u32 & 0x03) << 6) | (buffer[1].to_u32 << 8)
|
||||||
|
|
||||||
|
str_value = "%15d" % value
|
||||||
|
str_sign = binary_8(sign)[-1] # 1-bit value
|
||||||
|
str_exp = "%15s" % binary_8(exp)[3..7] # 5-bit value
|
||||||
|
str_man = "%30s" % binary_mantisse_f16(man) # 10-bit value
|
||||||
|
|
||||||
|
"16-bit: #{str_value} => #{str_sign} #{str_exp} #{str_man}"
|
||||||
|
end
|
||||||
|
|
||||||
|
def f32_to_f16(value : Float32)
|
||||||
|
|
||||||
|
puts get_summary value
|
||||||
|
buffer = get_buffer value
|
||||||
|
|
||||||
|
# 0 or 1
|
||||||
|
sign = (buffer[0].to_u32 >> 7)
|
||||||
|
# 8-bit value
|
||||||
|
exp = ((buffer[0].to_u32 & 0x7F) << 1) | (buffer[1].to_u32 >> 7)
|
||||||
|
# 23-bit value
|
||||||
|
man = (buffer[1].to_u32 << 16) | (buffer[2].to_u32 << 8) | buffer[3].to_u32
|
||||||
|
|
||||||
# Check for all exponent bits being set, which is Infinity or NaN
|
# Check for all exponent bits being set, which is Infinity or NaN
|
||||||
if exp == 0xFF
|
if exp == 0xFF
|
||||||
puts "exp == 0xFF"
|
|
||||||
# Set mantissa MSB for NaN (and also keep shifted mantissa bits)
|
# Set mantissa MSB for NaN (and also keep shifted mantissa bits)
|
||||||
nan_bit = man == 0 ? 0 : 0x0200
|
nan_bit = man == 0 ? 0 : 0x0200
|
||||||
pp! binary_24(nan_bit)
|
float16_value = (((sign << 15) | 0x7C00 | nan_bit | man) & 0xFFFF).to_u16
|
||||||
float16_value = ((sign << 15) | 0x7C00 | nan_bit | man) & 0xFFFF
|
puts "#{get_summary float16_value} => inf or nan"
|
||||||
f16_value = sprintf "%08b %08b", float16_value >> 8, float16_value & 0xFF
|
|
||||||
pp! f16_value
|
|
||||||
return float16_value
|
return float16_value
|
||||||
end
|
end
|
||||||
|
|
||||||
return 0
|
# The number is normalized, start assembling half precision version
|
||||||
|
half_sign = sign << 15
|
||||||
|
|
||||||
# // The number is normalized, start assembling half precision version
|
# Unbias the exponent, then bias for half precision
|
||||||
# let half_sign = sign >> 16;
|
half_exp = (exp.to_i64 - 127 + 15).to_i16
|
||||||
# // Unbias the exponent, then bias for half precision
|
# puts " exp: #{typeof(exp)} -> #{exp}"
|
||||||
# let unbiased_exp = ((exp >> 23) as i32) - 127;
|
# puts "half_exp: #{typeof(half_exp)} -> #{half_exp}"
|
||||||
# let half_exp = unbiased_exp + 15;
|
|
||||||
#
|
# Check for exponent overflow, return +infinity
|
||||||
# // Check for exponent overflow, return +infinity
|
if half_exp >= 0x1F
|
||||||
# if half_exp >= 0x1F {
|
final_value = (half_sign | 0x7C00).to_u16
|
||||||
# return (half_sign | 0x7C00u32) as u16;
|
puts "#{get_summary final_value} => overflow, return ± inf"
|
||||||
# }
|
return final_value
|
||||||
#
|
end
|
||||||
# // Check for underflow
|
|
||||||
# if half_exp <= 0 {
|
# Check for underflow
|
||||||
# // Check mantissa for what we can do
|
if half_exp <= 0
|
||||||
# if 14 - half_exp > 24 {
|
# Check mantissa for what we can do
|
||||||
# // No rounding possibility, so this is a full underflow, return signed zero
|
if 14 - half_exp > 24
|
||||||
# return half_sign as u16;
|
# No rounding possibility, so this is a full underflow, return signed zero
|
||||||
# }
|
puts "#{get_summary half_sign.to_u16} => full underflow"
|
||||||
# // Don't forget about hidden leading mantissa bit when assembling mantissa
|
return half_sign.to_u16
|
||||||
# let man = man | 0x0080_0000u32;
|
end
|
||||||
# let mut half_man = man >> (14 - half_exp);
|
|
||||||
# // Check for rounding (see comment above functions)
|
# Don't forget about hidden leading mantissa bit when assembling mantissa
|
||||||
# let round_bit = 1 << (13 - half_exp);
|
man = man | 0x0080_0000
|
||||||
# if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
|
half_man = man >> (14 - half_exp)
|
||||||
# half_man += 1;
|
|
||||||
# }
|
pp! binary_mantisse_f32(man), binary_mantisse_f16(half_man)
|
||||||
# // No exponent for subnormals
|
|
||||||
# return (half_sign | half_man) as u16;
|
# Check for rounding (see comment above functions)
|
||||||
# }
|
round_bit = 1 << (13 - half_exp)
|
||||||
#
|
if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0
|
||||||
# // Rebias the exponent
|
half_man += 1
|
||||||
# let half_exp = (half_exp as u32) << 10;
|
end
|
||||||
# let half_man = man >> 13;
|
|
||||||
# // Check for rounding (see comment above functions)
|
# No exponent for subnormals
|
||||||
# let round_bit = 0x0000_1000u32;
|
final_value = (half_sign | half_man).to_u16
|
||||||
# if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0 {
|
|
||||||
# // Round it
|
puts "#{get_summary final_value} => underflow"
|
||||||
# ((half_sign | half_exp | half_man) + 1) as u16
|
|
||||||
# } else {
|
return final_value
|
||||||
# (half_sign | half_exp | half_man) as u16
|
end
|
||||||
# }
|
|
||||||
|
# Rebias the exponent
|
||||||
|
half_exp = (half_exp) << 10
|
||||||
|
half_man = (man >> 13) & 0x03FF
|
||||||
|
# puts " man: #{binary_mantisse_f32(man)}"
|
||||||
|
# puts "half_man: #{binary_mantisse_f16(half_man)}"
|
||||||
|
# Check for rounding (see comment above functions)
|
||||||
|
round_bit = 0x0000_1000u32
|
||||||
|
final_value = if (man & round_bit) != 0 && (man & (3 * round_bit - 1)) != 0
|
||||||
|
# Round it
|
||||||
|
((half_sign | half_exp | half_man) + 1).to_u16
|
||||||
|
else
|
||||||
|
(half_sign | half_exp | half_man).to_u16
|
||||||
|
end
|
||||||
|
|
||||||
|
puts "#{get_summary final_value}"
|
||||||
|
final_value
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue