From 2e2edd19087cc9dcd24b25420b29ecc62e6e1067 Mon Sep 17 00:00:00 2001
From: Alberto Restifo <alberto@restifo.dev>
Date: Mon, 20 Apr 2020 14:57:20 +0200
Subject: [PATCH] Work on lexer and diagnostic representation

---
 .build.yml                   |   2 +-
 spec/cbor/lexer_spec.cr      |   2 +-
 spec/rfc_spec.cr             | 103 +++++++++++++++++++++++++++++++++++
 spec/rfc_tests.cr            |  84 ----------------------------
 src/cbor/decoder.cr          |  21 +++++++
 src/cbor/diagnostic.cr       |  50 +++++++++++++----
 src/cbor/lexer.cr            |  52 ++++++------------
 src/cbor/token.cr            |  41 ++++++++++++--
 src/cbor/type/bytes_array.cr |   1 +
 9 files changed, 220 insertions(+), 136 deletions(-)
 create mode 100644 spec/rfc_spec.cr
 delete mode 100644 spec/rfc_tests.cr

diff --git a/.build.yml b/.build.yml
index b542f83..e6d0abf 100644
--- a/.build.yml
+++ b/.build.yml
@@ -12,4 +12,4 @@ tasks:
       crystal tool format --check
   - test: |
       cd crystal-cbor
-      crystal spec
+      crystal spec --error-on-warnings
diff --git a/spec/cbor/lexer_spec.cr b/spec/cbor/lexer_spec.cr
index ad9eb00..c544e33 100644
--- a/spec/cbor/lexer_spec.cr
+++ b/spec/cbor/lexer_spec.cr
@@ -24,7 +24,7 @@ describe CBOR::Lexer do
         it "reads #{tt[:bytes].hexstring} as #{tt[:value].to_s}" do
           lexer = CBOR::Lexer.new(tt[:bytes])
 
-          token = lexer.read_token
+          token = lexer.next_token
           token.should be_a(CBOR::Token::IntT)
           token.as(CBOR::Token::IntT).value.should eq(tt[:value])
         end
diff --git a/spec/rfc_spec.cr b/spec/rfc_spec.cr
new file mode 100644
index 0000000..ed8c4ee
--- /dev/null
+++ b/spec/rfc_spec.cr
@@ -0,0 +1,103 @@
+require "./spec_helper"
+
+tests = [
+  { %(0), "00" },
+  { %(1), "01" },
+  { %(10), "0a" },
+  { %(23), "17" },
+  { %(24), "18 18" },
+  { %(25), "18 19" },
+  { %(100), "18 64" },
+  { %(1000), "19 03 e8" },
+  { %(1000000), "1a 00 0f 42 40" },
+  { %(1000000000000), "1b 00 00 00 e8 d4 a5 10 00" },
+  { %(18446744073709551615), "1b ff ff ff ff ff ff ff ff" },
+  # { %(18446744073709551616), "c2 49 01 00 00 00 00 00 00 00 00" },
+  { %(-18446744073709551616), "3b ff ff ff ff ff ff ff ff" },
+  # { %(-18446744073709551617), "c3 49 01 00 00 00 00 00 00 00 00" },
+  { %(-1), "20" },
+  { %(-10), "29" },
+  { %(-100), "38 63" },
+  { %(-1000), "39 03 e7" },
+  # { %(0.0), "f9 00 00" },
+  # { %(-0.0), "f9 80 00" },
+  # { %(1.0), "f9 3c 00" },
+  # { %(1.1), "fb 3f f1 99 99 99 99 99 9a" },
+  # { %(1.5), "f9 3e 00" },
+  # { %(65504.0), "f9 7b ff" },
+  # { %(100000.0), "fa 47 c3 50 00" },
+  # { %(3.4028234663852886e+38), "fa 7f 7f ff ff" },
+  # { %(1.0e+300), "fb 7e 37 e4 3c 88 00 75 9c" },
+  # { %(5.960464477539063e-8), "f9 00 01" },
+  # { %(0.00006103515625), "f9 04 00" },
+  # { %(-4.0), "f9 c4 00" },
+  # { %(-4.1), "fb c0 10 66 66 66 66 66 66" },
+  # { %(Infinity), "f9 7c 00" },
+  # { %(NaN), "f9 7e 00" },
+  # { %(-Infinity), "f9 fc 00" },
+  # { %(Infinity), "fa 7f 80 00 00" },
+  # { %(NaN), "fa 7f c0 00 00" },
+  # { %(-Infinity), "fa ff 80 00 00" },
+  # { %(Infinity), "fb 7f f0 00 00 00 00 00 00" },
+  # { %(NaN), "fb 7f f8 00 00 00 00 00 00" },
+  # { %(-Infinity), "fb ff f0 00 00 00 00 00 00" },
+  # { %(false), "f4" },
+  # { %(true), "f5" },
+  # { %(null), "f6" },
+  # { %(undefined), "f7" },
+  # { %(simple(16)), "f0" },
+  # { %(simple(24)), "f8 18" },
+  # { %(simple(255)), "f8 ff" },
+  # { %(0("2013-03-21T20:04:00Z")), "c0 74 32 30 31 33 2d 30 33 2d 32 31 54 32 30 3a 30 34 3a 30 30 5a" },
+  # { %(1(1363896240)), "c1 1a 51 4b 67 b0" },
+  # { %(1(1363896240.5)), "c1 fb 41 d4 52 d9 ec 20 00 00" },
+  # { %(23(h'01020304')), "d7 44 01 02 03 04" },
+  # { %(24(h'6449455446')), "d8 18 45 64 49 45 54 46" },
+  # { %(32("http://www.example.com")), "d8 20 76 68 74 74 70 3a 2f 2f 77 77 77 2e 65 78 61 6d 70 6c 65 2e 63 6f 6d" },
+  { %(h''), "40" },
+  { %(h'01020304'), "44 01 02 03 04" },
+  # { %(""), "60" },
+  # { %("a"), "61 61" },
+  # { %("IETF"), "64 49 45 54 46" },
+  # { %(""\\"), "62 22 5c" },
+  # { %("\u00fc"), "62 c3 bc" },
+  # { %("\u6c34"), "63 e6 b0 b4" },
+  # { %("\ud800\udd51"), "64 f0 90 85 91" },
+  # { %([]), "80" },
+  # { %([1, 2, 3]), "83 01 02 03" },
+  # { %([1, [2, 3], [4, 5]]), "83 01 82 02 03 82 04 05" },
+  # { %([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]), "98 19 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 14 15 16 17 18 18 18 19" },
+  # { %({}), "a0" },
+  # { %({1: 2, 3: 4}), "a2 01 02 03 04" },
+  # { %({"a": 1, "b": [2, 3]}), "a2 61 61 01 61 62 82 02 03" },
+  # { %(["a", {"b": "c"}]), "82 61 61 a1 61 62 61 63" },
+  # { %({"a": "A", "b": "B", "c": "C", "d": "D", "e": "E"}), "a5 61 61 61 41 61 62 61 42 61 63 61 43 61 64 61 44 61 65 61 45" },
+  { %((_ h'0102', h'030405')), "5f 42 01 02 43 03 04 05 ff" },
+  # { %((_ "strea", "ming")), "7f 65 73 74 72 65 61 64 6d 69 6e 67 ff" },
+  # { %([_ ]), "9f ff" },
+  # { %([_ 1, [2, 3], [_ 4, 5]]), "9f 01 82 02 03 9f 04 05 ff ff" },
+  # { %([_ 1, [2, 3], [4, 5]]), "9f 01 82 02 03 82 04 05 ff" },
+  # { %([1, [2, 3], [_ 4, 5]]), "83 01 82 02 03 9f 04 05 ff" },
+  # { %([1, [_ 2, 3], [4, 5]]), "83 01 9f 02 03 ff 82 04 05" },
+  # { %([_ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]), "9f 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 14 15 16 17 18 18 18 19 ff" },
+  # { %({_ "a": 1, "b": [_ 2, 3]}), "bf 61 61 01 61 62 9f 02 03 ff ff" },
+  # { %(["a", {_ "b": "c"}]), "82 61 61 bf 61 62 61 63 ff" },
+  # { %({_ "Fun": true, "Amt": -2}), "bf 63 46 75 6e f5 63 41 6d 74 21 ff" },
+]
+
+describe "Examples from RFC7049 Appendix A" do
+  tests.each_with_index do |tt, index|
+    describe "test ##{index}" do
+      diagnostic, hex_string = tt
+
+      bytes_arr = hex_string.split.map(&.to_u8(16))
+      bytes = Bytes.new(bytes_arr.to_unsafe, bytes_arr.size)
+
+      it "reads #{bytes.hexstring} as #{diagnostic}" do
+        result = CBOR::Diagnostic.new(bytes).to_s
+
+        result.should eq(diagnostic)
+      end
+    end
+  end
+end
diff --git a/spec/rfc_tests.cr b/spec/rfc_tests.cr
deleted file mode 100644
index 919f256..0000000
--- a/spec/rfc_tests.cr
+++ /dev/null
@@ -1,84 +0,0 @@
-tests = [
-  { %(0), "00" },
-  { %(1), "01" },
-  { %(10), "0a" },
-  { %(23), "17" },
-  { %(24), "18 18" },
-  { %(25), "18 19" },
-  { %(100), "18 64" },
-  { %(1000), "19 03 e8" },
-  { %(1000000), "1a 00 0f 42 40" },
-  { %(1000000000000), "1b 00 00 00 e8 d4 a5 10 00" },
-  { %(18446744073709551615), "1b ff ff ff ff ff ff ff ff" },
-  { %(18446744073709551616), "c2 49 01 00 00 00 00 00 00 00 00" },
-  { %(-18446744073709551616), "3b ff ff ff ff ff ff ff ff" },
-  { %(-18446744073709551617), "c3 49 01 00 00 00 00 00 00 00 00" },
-  { %(-1), "20" },
-  { %(-10), "29" },
-  { %(-100), "38 63" },
-  { %(-1000), "39 03 e7" },
-  { %(0.0), "f9 00 00" },
-  { %(-0.0), "f9 80 00" },
-  { %(1.0), "f9 3c 00" },
-  { %(1.1), "fb 3f f1 99 99 99 99 99 9a" },
-  { %(1.5), "f9 3e 00" },
-  { %(65504.0), "f9 7b ff" },
-  { %(100000.0), "fa 47 c3 50 00" },
-  { %(3.4028234663852886e+38), "fa 7f 7f ff ff" },
-  { %(1.0e+300), "fb 7e 37 e4 3c 88 00 75 9c" },
-  { %(5.960464477539063e-8), "f9 00 01" },
-  { %(0.00006103515625), "f9 04 00" },
-  { %(-4.0), "f9 c4 00" },
-  { %(-4.1), "fb c0 10 66 66 66 66 66 66" },
-  { %(Infinity), "f9 7c 00" },
-  { %(NaN), "f9 7e 00" },
-  { %(-Infinity), "f9 fc 00" },
-  { %(Infinity), "fa 7f 80 00 00" },
-  { %(NaN), "fa 7f c0 00 00" },
-  { %(-Infinity), "fa ff 80 00 00" },
-  { %(Infinity), "fb 7f f0 00 00 00 00 00 00" },
-  { %(NaN), "fb 7f f8 00 00 00 00 00 00" },
-  { %(-Infinity), "fb ff f0 00 00 00 00 00 00" },
-  { %(false), "f4" },
-  { %(true), "f5" },
-  { %(null), "f6" },
-  { %(undefined), "f7" },
-  { %(simple(16)), "f0" },
-  { %(simple(24)), "f8 18" },
-  { %(simple(255)), "f8 ff" },
-  { %(0("2013-03-21T20:04:00Z")), "c0 74 32 30 31 33 2d 30 33 2d 32 31 54 32 30 3a 30 34 3a 30 30 5a" },
-  { %(1(1363896240)), "c1 1a 51 4b 67 b0" },
-  { %(1(1363896240.5)), "c1 fb 41 d4 52 d9 ec 20 00 00" },
-  { %(23(h'01020304')), "d7 44 01 02 03 04" },
-  { %(24(h'6449455446')), "d8 18 45 64 49 45 54 46" },
-  { %(32("http://www.example.com")), "d8 20 76 68 74 74 70 3a 2f 2f 77 77 77 2e 65 78 61 6d 70 6c 65 2e 63 6f 6d" },
-  { %(h''), "40" },
-  { %(h'01020304'), "44 01 02 03 04" },
-  { %(""), "60" },
-  { %("a"), "61 61" },
-  { %("IETF"), "64 49 45 54 46" },
-  { %(""\\"), "62225c" },
-  { %("\u00fc"), "62 c3 bc" },
-  { %("\u6c34"), "63 e6 b0 b4" },
-  { %("\ud800\udd51"), "64 f0 90 85 91" },
-  { %([]), "80" },
-  { %([1, 2, 3]), "83 01 02 03" },
-  { %([1, [2, 3], [4, 5]]), "83 01 82 02 03 82 04 05" },
-  { %([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]), "98 19 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 14 15 16 17 18 18 18 19" },
-  { %({}), "a0" },
-  { %({1: 2, 3: 4}), "a2 01 02 03 04" },
-  { %({"a": 1, "b": [2, 3]}), "a2 61 61 01 61 62 82 02 03" },
-  { %(["a", {"b": "c"}]), "82 61 61 a1 61 62 61 63" },
-  { %({"a": "A", "b": "B", "c": "C", "d": "D", "e": "E"}), "a5 61 61 61 41 61 62 61 42 61 63 61 43 61 64 61 44 61 65 61 45" },
-  { %((_ h'0102', h'030405')), "5f 42 01 02 43 03 04 05 ff" },
-  { %((_ "strea", "ming")), "7f 65 73 74 72 65 61 64 6d 69 6e 67 ff" },
-  { %([_ ]), "9f ff" },
-  { %([_ 1, [2, 3], [_ 4, 5]]), "9f 01 82 02 03 9f 04 05 ff ff" },
-  { %([_ 1, [2, 3], [4, 5]]), "9f 01 82 02 03 82 04 05 ff" },
-  { %([1, [2, 3], [_ 4, 5]]), "83 01 82 02 03 9f 04 05 ff" },
-  { %([1, [_ 2, 3], [4, 5]]), "83 01 9f 02 03 ff 82 04 05" },
-  { %([_ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]), "9f 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 14 15 16 17 18 18 18 19 ff" },
-  { %({_ "a": 1, "b": [_ 2, 3]}), "bf 61 61 01 61 62 9f 02 03 ff ff" },
-  { %(["a", {_ "b": "c"}]), "82 61 61 bf 61 62 61 63 ff" },
-  { %({_ "Fun": true, "Amt": -2}), "bf 63 46 75 6e f5 63 41 6d 74 21 ff" },
-]
diff --git a/src/cbor/decoder.cr b/src/cbor/decoder.cr
index 50ff8b4..87f2ad7 100644
--- a/src/cbor/decoder.cr
+++ b/src/cbor/decoder.cr
@@ -22,4 +22,25 @@ abstract class CBOR::Decoder
       # Consume the array :)
     end
   end
+
+  private def read_bytes_array_body
+    read_type(Token::ByteArrayT) do |token|
+    end
+  end
+
+  private macro read_type(type, finish_token = true, &block)
+    case token = current_token
+    when {{type}}
+      {% if finish_token %}finish_token!{% end %}
+      {{ block.body }}
+    else
+      unexpected_token(token, {{type.stringify.split("::").last}})
+    end
+  end
+
+  private def unexpected_token(token, expected = nil)
+    message = "Unexpected token #{Token.to_s(token)}"
+    message += " expected #{expected}" if expected
+    raise TypeCastError.new(message, token.byte_number)
+  end
 end
diff --git a/src/cbor/diagnostic.cr b/src/cbor/diagnostic.cr
index 5129130..58ed19b 100644
--- a/src/cbor/diagnostic.cr
+++ b/src/cbor/diagnostic.cr
@@ -1,15 +1,45 @@
-module CBOR::Diagnostic
-  def to_s(value : CBOR::ByteArray) : String
-    value.to_diagnostic
+require "./lexer"
+require "./token"
+
+# Reads a CBOR input into a diagnostic string.
+# This consumes the IO and is mostly usedful to tests again the example
+# provided in the RFC and ensuring a correct functioning of the `CBOR::Lexer`.
+class CBOR::Diagnostic
+  @lexer : Lexer
+  @is_array : Bool = false
+
+  def initialize(input)
+    @lexer = Lexer.new(input)
   end
 
-  {% for type in [UInt8, Int8, UInt16, Int16, UInt32, Int32, UInt64, Int64, Int128] %}
-    def to_s(value : {{type}}) : String
-      {{type}}.to_s
-    end
-  {% end %}
+  # Reads the content of the IO and prints out a diagnostic string
+  # represation of the input.
+  def to_s : String
+    result = ""
 
-  def to_s(value : String)
-    %("#{value}")
+    while val = next_value
+      result += val
+    end
+
+    result
+  end
+
+  private def next_value : String?
+    token = @lexer.next_token
+    return nil unless token
+
+    case token
+    when Token::BytesArray
+      @is_array = true
+    when Token::BreakT
+      @is_array = flase
+    end
+
+    separator + Token.to_diagnostic(token)
+  end
+
+  private def separator : String
+    return ", " if @is_array
+    ""
   end
 end
diff --git a/src/cbor/lexer.cr b/src/cbor/lexer.cr
index e053ac4..d43dd20 100644
--- a/src/cbor/lexer.cr
+++ b/src/cbor/lexer.cr
@@ -11,44 +11,19 @@ class CBOR::Lexer
     new IO::Memory.new(slice)
   end
 
-  @token : Token::T
   @current_pos : Int64
-  @token_finished : Bool
+  @eof : Bool = false
 
   def initialize(@io : IO)
     @current_pos = 0
-    @token = Token::NullT.new(0)
-    @token_finished = true
   end
 
-  @[AlwaysInline]
-  def current_token : Token::T
-    if @token_finished
-      @token_finished = false
-      @token = next_token
-    else
-      @token
-    end
-  end
+  def next_token
+    return nil if @eof
 
-  @[AlwaysInline]
-  def finish_token!
-    @token_finished = true
-  end
-
-  @[AlwaysInline]
-  def read_token : Token::T
-    if @token_finished
-      @token = next_token
-    else
-      finish_token!
-    end
-    @token
-  end
-
-  private def next_token
     @current_pos = @io.pos.to_i64
-    current_byte = next_byte
+    current_byte = @io.read_byte
+    return nil unless current_byte
 
     case current_byte
     when 0x00..0x17
@@ -85,16 +60,23 @@ class CBOR::Lexer
     when 0x5b
       consume_binary(read(UInt64))
     when 0x5f
-      Token::BytesArrayT.new(@current_pos)
+      Token::BytesArrayStartT.new(@current_pos)
+    when 0xff
+      # TODO: Define which segment it's breaking
+      Token::BreakT.new(@current_pos)
     else
-      raise ParseError.new("Unexpected first byte #{current_byte}")
+      raise ParseError.new("Unexpected first byte 0x#{current_byte.to_s(16)}")
     end
   end
 
-  private def next_byte : UInt8
+  private def next_byte : UInt8?
     byte = @io.read_byte
-    raise ParseError.new("Unexpected EOF at byte #{@io.pos}") unless byte
-    byte
+    if byte
+      byte
+    else
+      @eof = true
+      nil
+    end
   end
 
   private def consume_int(value)
diff --git a/src/cbor/token.cr b/src/cbor/token.cr
index 1247769..0ce1714 100644
--- a/src/cbor/token.cr
+++ b/src/cbor/token.cr
@@ -1,5 +1,6 @@
 class CBOR::Token
   record NullT, byte_number : Int64
+  record UndefinedT, byte_number : Int64
   record BoolT, byte_number : Int64, value : Bool
   record ArrayT, byte_number : Int64, size : UInt32?
   record MapT, byte_number : Int64, size : UInt32?
@@ -7,12 +8,26 @@ class CBOR::Token
   record FloatT, byte_number : Int64, value : Float64
   record StringT, byte_number : Int64, value : String
   record BytesT, byte_number : Int64, value : Bytes
-  record StringArrayT, byte_number : Int64
-  record BytesArrayT, byte_number : Int64
+  record StringArrayStartT, byte_number : Int64
+  record StringArrayEndT, byte_number : Int64
+  record BytesArrayStartT, byte_number : Int64
+  record BytesArrayEndT, byte_number : Int64
 
-  alias T = NullT | BoolT | ArrayT | MapT | IntT | FloatT | StringT | BytesT | StringArrayT | BytesArrayT
+  alias T = NullT |
+            UndefinedT |
+            BoolT |
+            ArrayT |
+            MapT |
+            IntT |
+            FloatT |
+            StringT |
+            BytesT |
+            StringArrayStartT |
+            StringArrayEndT |
+            BytesArrayStartT |
+            BytesArrayEndT
 
-  def self.to_s(token : T)
+  def self.to_diagnostic(token : T) : String
     case token
     when IntT
       token.value.to_s
@@ -23,8 +38,24 @@ class CBOR::Token
       "null"
     when UndefinedT
       "undefined"
+    when BoolT
+      token.value.to_s
+    when BytesArrayStartT
+      "(_ "
+    when BytesArrayEndT
+      ")"
+    when FloatT
+      "TODO"
+    when StringT
+      "TODO"
+    when StringArrayT
+      "TODO"
+    when MapT
+      "TODO"
+    when ArrayT
+      "TODO"
     else
-      raise "Diagnostic notation for type #{token.class} not implemented"
+      raise "Uknown diagnostics representation for #{token.class}"
     end
   end
 end
diff --git a/src/cbor/type/bytes_array.cr b/src/cbor/type/bytes_array.cr
index 1cdfdcf..5c182e3 100644
--- a/src/cbor/type/bytes_array.cr
+++ b/src/cbor/type/bytes_array.cr
@@ -1,5 +1,6 @@
 class CBOR::BytesArray < Array(UInt8)
   def to_a : Array(UInt8)
+    self.as(Array(UInt8))
   end
 
   def to_bytes : Bytes