From 2e2edd19087cc9dcd24b25420b29ecc62e6e1067 Mon Sep 17 00:00:00 2001 From: Alberto Restifo Date: Mon, 20 Apr 2020 14:57:20 +0200 Subject: [PATCH] Work on lexer and diagnostic representation --- .build.yml | 2 +- spec/cbor/lexer_spec.cr | 2 +- spec/rfc_spec.cr | 103 +++++++++++++++++++++++++++++++++++ spec/rfc_tests.cr | 84 ---------------------------- src/cbor/decoder.cr | 21 +++++++ src/cbor/diagnostic.cr | 50 +++++++++++++---- src/cbor/lexer.cr | 52 ++++++------------ src/cbor/token.cr | 41 ++++++++++++-- src/cbor/type/bytes_array.cr | 1 + 9 files changed, 220 insertions(+), 136 deletions(-) create mode 100644 spec/rfc_spec.cr delete mode 100644 spec/rfc_tests.cr diff --git a/.build.yml b/.build.yml index b542f83..e6d0abf 100644 --- a/.build.yml +++ b/.build.yml @@ -12,4 +12,4 @@ tasks: crystal tool format --check - test: | cd crystal-cbor - crystal spec + crystal spec --error-on-warnings diff --git a/spec/cbor/lexer_spec.cr b/spec/cbor/lexer_spec.cr index ad9eb00..c544e33 100644 --- a/spec/cbor/lexer_spec.cr +++ b/spec/cbor/lexer_spec.cr @@ -24,7 +24,7 @@ describe CBOR::Lexer do it "reads #{tt[:bytes].hexstring} as #{tt[:value].to_s}" do lexer = CBOR::Lexer.new(tt[:bytes]) - token = lexer.read_token + token = lexer.next_token token.should be_a(CBOR::Token::IntT) token.as(CBOR::Token::IntT).value.should eq(tt[:value]) end diff --git a/spec/rfc_spec.cr b/spec/rfc_spec.cr new file mode 100644 index 0000000..ed8c4ee --- /dev/null +++ b/spec/rfc_spec.cr @@ -0,0 +1,103 @@ +require "./spec_helper" + +tests = [ + { %(0), "00" }, + { %(1), "01" }, + { %(10), "0a" }, + { %(23), "17" }, + { %(24), "18 18" }, + { %(25), "18 19" }, + { %(100), "18 64" }, + { %(1000), "19 03 e8" }, + { %(1000000), "1a 00 0f 42 40" }, + { %(1000000000000), "1b 00 00 00 e8 d4 a5 10 00" }, + { %(18446744073709551615), "1b ff ff ff ff ff ff ff ff" }, + # { %(18446744073709551616), "c2 49 01 00 00 00 00 00 00 00 00" }, + { %(-18446744073709551616), "3b ff ff ff ff ff ff ff ff" }, + # { %(-18446744073709551617), "c3 49 01 00 00 00 00 00 00 00 00" }, + { %(-1), "20" }, + { %(-10), "29" }, + { %(-100), "38 63" }, + { %(-1000), "39 03 e7" }, + # { %(0.0), "f9 00 00" }, + # { %(-0.0), "f9 80 00" }, + # { %(1.0), "f9 3c 00" }, + # { %(1.1), "fb 3f f1 99 99 99 99 99 9a" }, + # { %(1.5), "f9 3e 00" }, + # { %(65504.0), "f9 7b ff" }, + # { %(100000.0), "fa 47 c3 50 00" }, + # { %(3.4028234663852886e+38), "fa 7f 7f ff ff" }, + # { %(1.0e+300), "fb 7e 37 e4 3c 88 00 75 9c" }, + # { %(5.960464477539063e-8), "f9 00 01" }, + # { %(0.00006103515625), "f9 04 00" }, + # { %(-4.0), "f9 c4 00" }, + # { %(-4.1), "fb c0 10 66 66 66 66 66 66" }, + # { %(Infinity), "f9 7c 00" }, + # { %(NaN), "f9 7e 00" }, + # { %(-Infinity), "f9 fc 00" }, + # { %(Infinity), "fa 7f 80 00 00" }, + # { %(NaN), "fa 7f c0 00 00" }, + # { %(-Infinity), "fa ff 80 00 00" }, + # { %(Infinity), "fb 7f f0 00 00 00 00 00 00" }, + # { %(NaN), "fb 7f f8 00 00 00 00 00 00" }, + # { %(-Infinity), "fb ff f0 00 00 00 00 00 00" }, + # { %(false), "f4" }, + # { %(true), "f5" }, + # { %(null), "f6" }, + # { %(undefined), "f7" }, + # { %(simple(16)), "f0" }, + # { %(simple(24)), "f8 18" }, + # { %(simple(255)), "f8 ff" }, + # { %(0("2013-03-21T20:04:00Z")), "c0 74 32 30 31 33 2d 30 33 2d 32 31 54 32 30 3a 30 34 3a 30 30 5a" }, + # { %(1(1363896240)), "c1 1a 51 4b 67 b0" }, + # { %(1(1363896240.5)), "c1 fb 41 d4 52 d9 ec 20 00 00" }, + # { %(23(h'01020304')), "d7 44 01 02 03 04" }, + # { %(24(h'6449455446')), "d8 18 45 64 49 45 54 46" }, + # { %(32("http://www.example.com")), "d8 20 76 68 74 74 70 3a 2f 2f 77 77 77 2e 65 78 61 6d 70 6c 65 2e 63 6f 6d" }, + { %(h''), "40" }, + { %(h'01020304'), "44 01 02 03 04" }, + # { %(""), "60" }, + # { %("a"), "61 61" }, + # { %("IETF"), "64 49 45 54 46" }, + # { %(""\\"), "62 22 5c" }, + # { %("\u00fc"), "62 c3 bc" }, + # { %("\u6c34"), "63 e6 b0 b4" }, + # { %("\ud800\udd51"), "64 f0 90 85 91" }, + # { %([]), "80" }, + # { %([1, 2, 3]), "83 01 02 03" }, + # { %([1, [2, 3], [4, 5]]), "83 01 82 02 03 82 04 05" }, + # { %([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]), "98 19 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 14 15 16 17 18 18 18 19" }, + # { %({}), "a0" }, + # { %({1: 2, 3: 4}), "a2 01 02 03 04" }, + # { %({"a": 1, "b": [2, 3]}), "a2 61 61 01 61 62 82 02 03" }, + # { %(["a", {"b": "c"}]), "82 61 61 a1 61 62 61 63" }, + # { %({"a": "A", "b": "B", "c": "C", "d": "D", "e": "E"}), "a5 61 61 61 41 61 62 61 42 61 63 61 43 61 64 61 44 61 65 61 45" }, + { %((_ h'0102', h'030405')), "5f 42 01 02 43 03 04 05 ff" }, + # { %((_ "strea", "ming")), "7f 65 73 74 72 65 61 64 6d 69 6e 67 ff" }, + # { %([_ ]), "9f ff" }, + # { %([_ 1, [2, 3], [_ 4, 5]]), "9f 01 82 02 03 9f 04 05 ff ff" }, + # { %([_ 1, [2, 3], [4, 5]]), "9f 01 82 02 03 82 04 05 ff" }, + # { %([1, [2, 3], [_ 4, 5]]), "83 01 82 02 03 9f 04 05 ff" }, + # { %([1, [_ 2, 3], [4, 5]]), "83 01 9f 02 03 ff 82 04 05" }, + # { %([_ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]), "9f 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 14 15 16 17 18 18 18 19 ff" }, + # { %({_ "a": 1, "b": [_ 2, 3]}), "bf 61 61 01 61 62 9f 02 03 ff ff" }, + # { %(["a", {_ "b": "c"}]), "82 61 61 bf 61 62 61 63 ff" }, + # { %({_ "Fun": true, "Amt": -2}), "bf 63 46 75 6e f5 63 41 6d 74 21 ff" }, +] + +describe "Examples from RFC7049 Appendix A" do + tests.each_with_index do |tt, index| + describe "test ##{index}" do + diagnostic, hex_string = tt + + bytes_arr = hex_string.split.map(&.to_u8(16)) + bytes = Bytes.new(bytes_arr.to_unsafe, bytes_arr.size) + + it "reads #{bytes.hexstring} as #{diagnostic}" do + result = CBOR::Diagnostic.new(bytes).to_s + + result.should eq(diagnostic) + end + end + end +end diff --git a/spec/rfc_tests.cr b/spec/rfc_tests.cr deleted file mode 100644 index 919f256..0000000 --- a/spec/rfc_tests.cr +++ /dev/null @@ -1,84 +0,0 @@ -tests = [ - { %(0), "00" }, - { %(1), "01" }, - { %(10), "0a" }, - { %(23), "17" }, - { %(24), "18 18" }, - { %(25), "18 19" }, - { %(100), "18 64" }, - { %(1000), "19 03 e8" }, - { %(1000000), "1a 00 0f 42 40" }, - { %(1000000000000), "1b 00 00 00 e8 d4 a5 10 00" }, - { %(18446744073709551615), "1b ff ff ff ff ff ff ff ff" }, - { %(18446744073709551616), "c2 49 01 00 00 00 00 00 00 00 00" }, - { %(-18446744073709551616), "3b ff ff ff ff ff ff ff ff" }, - { %(-18446744073709551617), "c3 49 01 00 00 00 00 00 00 00 00" }, - { %(-1), "20" }, - { %(-10), "29" }, - { %(-100), "38 63" }, - { %(-1000), "39 03 e7" }, - { %(0.0), "f9 00 00" }, - { %(-0.0), "f9 80 00" }, - { %(1.0), "f9 3c 00" }, - { %(1.1), "fb 3f f1 99 99 99 99 99 9a" }, - { %(1.5), "f9 3e 00" }, - { %(65504.0), "f9 7b ff" }, - { %(100000.0), "fa 47 c3 50 00" }, - { %(3.4028234663852886e+38), "fa 7f 7f ff ff" }, - { %(1.0e+300), "fb 7e 37 e4 3c 88 00 75 9c" }, - { %(5.960464477539063e-8), "f9 00 01" }, - { %(0.00006103515625), "f9 04 00" }, - { %(-4.0), "f9 c4 00" }, - { %(-4.1), "fb c0 10 66 66 66 66 66 66" }, - { %(Infinity), "f9 7c 00" }, - { %(NaN), "f9 7e 00" }, - { %(-Infinity), "f9 fc 00" }, - { %(Infinity), "fa 7f 80 00 00" }, - { %(NaN), "fa 7f c0 00 00" }, - { %(-Infinity), "fa ff 80 00 00" }, - { %(Infinity), "fb 7f f0 00 00 00 00 00 00" }, - { %(NaN), "fb 7f f8 00 00 00 00 00 00" }, - { %(-Infinity), "fb ff f0 00 00 00 00 00 00" }, - { %(false), "f4" }, - { %(true), "f5" }, - { %(null), "f6" }, - { %(undefined), "f7" }, - { %(simple(16)), "f0" }, - { %(simple(24)), "f8 18" }, - { %(simple(255)), "f8 ff" }, - { %(0("2013-03-21T20:04:00Z")), "c0 74 32 30 31 33 2d 30 33 2d 32 31 54 32 30 3a 30 34 3a 30 30 5a" }, - { %(1(1363896240)), "c1 1a 51 4b 67 b0" }, - { %(1(1363896240.5)), "c1 fb 41 d4 52 d9 ec 20 00 00" }, - { %(23(h'01020304')), "d7 44 01 02 03 04" }, - { %(24(h'6449455446')), "d8 18 45 64 49 45 54 46" }, - { %(32("http://www.example.com")), "d8 20 76 68 74 74 70 3a 2f 2f 77 77 77 2e 65 78 61 6d 70 6c 65 2e 63 6f 6d" }, - { %(h''), "40" }, - { %(h'01020304'), "44 01 02 03 04" }, - { %(""), "60" }, - { %("a"), "61 61" }, - { %("IETF"), "64 49 45 54 46" }, - { %(""\\"), "62225c" }, - { %("\u00fc"), "62 c3 bc" }, - { %("\u6c34"), "63 e6 b0 b4" }, - { %("\ud800\udd51"), "64 f0 90 85 91" }, - { %([]), "80" }, - { %([1, 2, 3]), "83 01 02 03" }, - { %([1, [2, 3], [4, 5]]), "83 01 82 02 03 82 04 05" }, - { %([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]), "98 19 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 14 15 16 17 18 18 18 19" }, - { %({}), "a0" }, - { %({1: 2, 3: 4}), "a2 01 02 03 04" }, - { %({"a": 1, "b": [2, 3]}), "a2 61 61 01 61 62 82 02 03" }, - { %(["a", {"b": "c"}]), "82 61 61 a1 61 62 61 63" }, - { %({"a": "A", "b": "B", "c": "C", "d": "D", "e": "E"}), "a5 61 61 61 41 61 62 61 42 61 63 61 43 61 64 61 44 61 65 61 45" }, - { %((_ h'0102', h'030405')), "5f 42 01 02 43 03 04 05 ff" }, - { %((_ "strea", "ming")), "7f 65 73 74 72 65 61 64 6d 69 6e 67 ff" }, - { %([_ ]), "9f ff" }, - { %([_ 1, [2, 3], [_ 4, 5]]), "9f 01 82 02 03 9f 04 05 ff ff" }, - { %([_ 1, [2, 3], [4, 5]]), "9f 01 82 02 03 82 04 05 ff" }, - { %([1, [2, 3], [_ 4, 5]]), "83 01 82 02 03 9f 04 05 ff" }, - { %([1, [_ 2, 3], [4, 5]]), "83 01 9f 02 03 ff 82 04 05" }, - { %([_ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]), "9f 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 14 15 16 17 18 18 18 19 ff" }, - { %({_ "a": 1, "b": [_ 2, 3]}), "bf 61 61 01 61 62 9f 02 03 ff ff" }, - { %(["a", {_ "b": "c"}]), "82 61 61 bf 61 62 61 63 ff" }, - { %({_ "Fun": true, "Amt": -2}), "bf 63 46 75 6e f5 63 41 6d 74 21 ff" }, -] diff --git a/src/cbor/decoder.cr b/src/cbor/decoder.cr index 50ff8b4..87f2ad7 100644 --- a/src/cbor/decoder.cr +++ b/src/cbor/decoder.cr @@ -22,4 +22,25 @@ abstract class CBOR::Decoder # Consume the array :) end end + + private def read_bytes_array_body + read_type(Token::ByteArrayT) do |token| + end + end + + private macro read_type(type, finish_token = true, &block) + case token = current_token + when {{type}} + {% if finish_token %}finish_token!{% end %} + {{ block.body }} + else + unexpected_token(token, {{type.stringify.split("::").last}}) + end + end + + private def unexpected_token(token, expected = nil) + message = "Unexpected token #{Token.to_s(token)}" + message += " expected #{expected}" if expected + raise TypeCastError.new(message, token.byte_number) + end end diff --git a/src/cbor/diagnostic.cr b/src/cbor/diagnostic.cr index 5129130..58ed19b 100644 --- a/src/cbor/diagnostic.cr +++ b/src/cbor/diagnostic.cr @@ -1,15 +1,45 @@ -module CBOR::Diagnostic - def to_s(value : CBOR::ByteArray) : String - value.to_diagnostic +require "./lexer" +require "./token" + +# Reads a CBOR input into a diagnostic string. +# This consumes the IO and is mostly usedful to tests again the example +# provided in the RFC and ensuring a correct functioning of the `CBOR::Lexer`. +class CBOR::Diagnostic + @lexer : Lexer + @is_array : Bool = false + + def initialize(input) + @lexer = Lexer.new(input) end - {% for type in [UInt8, Int8, UInt16, Int16, UInt32, Int32, UInt64, Int64, Int128] %} - def to_s(value : {{type}}) : String - {{type}}.to_s - end - {% end %} + # Reads the content of the IO and prints out a diagnostic string + # represation of the input. + def to_s : String + result = "" - def to_s(value : String) - %("#{value}") + while val = next_value + result += val + end + + result + end + + private def next_value : String? + token = @lexer.next_token + return nil unless token + + case token + when Token::BytesArray + @is_array = true + when Token::BreakT + @is_array = flase + end + + separator + Token.to_diagnostic(token) + end + + private def separator : String + return ", " if @is_array + "" end end diff --git a/src/cbor/lexer.cr b/src/cbor/lexer.cr index e053ac4..d43dd20 100644 --- a/src/cbor/lexer.cr +++ b/src/cbor/lexer.cr @@ -11,44 +11,19 @@ class CBOR::Lexer new IO::Memory.new(slice) end - @token : Token::T @current_pos : Int64 - @token_finished : Bool + @eof : Bool = false def initialize(@io : IO) @current_pos = 0 - @token = Token::NullT.new(0) - @token_finished = true end - @[AlwaysInline] - def current_token : Token::T - if @token_finished - @token_finished = false - @token = next_token - else - @token - end - end + def next_token + return nil if @eof - @[AlwaysInline] - def finish_token! - @token_finished = true - end - - @[AlwaysInline] - def read_token : Token::T - if @token_finished - @token = next_token - else - finish_token! - end - @token - end - - private def next_token @current_pos = @io.pos.to_i64 - current_byte = next_byte + current_byte = @io.read_byte + return nil unless current_byte case current_byte when 0x00..0x17 @@ -85,16 +60,23 @@ class CBOR::Lexer when 0x5b consume_binary(read(UInt64)) when 0x5f - Token::BytesArrayT.new(@current_pos) + Token::BytesArrayStartT.new(@current_pos) + when 0xff + # TODO: Define which segment it's breaking + Token::BreakT.new(@current_pos) else - raise ParseError.new("Unexpected first byte #{current_byte}") + raise ParseError.new("Unexpected first byte 0x#{current_byte.to_s(16)}") end end - private def next_byte : UInt8 + private def next_byte : UInt8? byte = @io.read_byte - raise ParseError.new("Unexpected EOF at byte #{@io.pos}") unless byte - byte + if byte + byte + else + @eof = true + nil + end end private def consume_int(value) diff --git a/src/cbor/token.cr b/src/cbor/token.cr index 1247769..0ce1714 100644 --- a/src/cbor/token.cr +++ b/src/cbor/token.cr @@ -1,5 +1,6 @@ class CBOR::Token record NullT, byte_number : Int64 + record UndefinedT, byte_number : Int64 record BoolT, byte_number : Int64, value : Bool record ArrayT, byte_number : Int64, size : UInt32? record MapT, byte_number : Int64, size : UInt32? @@ -7,12 +8,26 @@ class CBOR::Token record FloatT, byte_number : Int64, value : Float64 record StringT, byte_number : Int64, value : String record BytesT, byte_number : Int64, value : Bytes - record StringArrayT, byte_number : Int64 - record BytesArrayT, byte_number : Int64 + record StringArrayStartT, byte_number : Int64 + record StringArrayEndT, byte_number : Int64 + record BytesArrayStartT, byte_number : Int64 + record BytesArrayEndT, byte_number : Int64 - alias T = NullT | BoolT | ArrayT | MapT | IntT | FloatT | StringT | BytesT | StringArrayT | BytesArrayT + alias T = NullT | + UndefinedT | + BoolT | + ArrayT | + MapT | + IntT | + FloatT | + StringT | + BytesT | + StringArrayStartT | + StringArrayEndT | + BytesArrayStartT | + BytesArrayEndT - def self.to_s(token : T) + def self.to_diagnostic(token : T) : String case token when IntT token.value.to_s @@ -23,8 +38,24 @@ class CBOR::Token "null" when UndefinedT "undefined" + when BoolT + token.value.to_s + when BytesArrayStartT + "(_ " + when BytesArrayEndT + ")" + when FloatT + "TODO" + when StringT + "TODO" + when StringArrayT + "TODO" + when MapT + "TODO" + when ArrayT + "TODO" else - raise "Diagnostic notation for type #{token.class} not implemented" + raise "Uknown diagnostics representation for #{token.class}" end end end diff --git a/src/cbor/type/bytes_array.cr b/src/cbor/type/bytes_array.cr index 1cdfdcf..5c182e3 100644 --- a/src/cbor/type/bytes_array.cr +++ b/src/cbor/type/bytes_array.cr @@ -1,5 +1,6 @@ class CBOR::BytesArray < Array(UInt8) def to_a : Array(UInt8) + self.as(Array(UInt8)) end def to_bytes : Bytes