parser/src/DomainParser.purs

-- | `DomainParser` is a simple parser for domain names as described in RFC 1035.
module DomainParser where

import Prelude (bind, not, pure, ($), (&&), (*>), (<<<), (<>), (>), (-))

import Control.Lazy (defer)
import Data.Maybe (Maybe(..), maybe)
import Data.Either (Either(..))
import Data.Array as A
import Data.String as S
import Data.String.CodeUnits as CU
import Control.Alt ((<|>))
-- import Control.Plus (empty)

import Parser (Parser(..)
              , success, failError
              , current_position
              , alphanum, char, letter, many1, parse, string)

type Size = Int
data DomainError
  = SubdomainTooLarge Size
  | DomainTooLarge Size
  | InvalidCharacter
  | EOFExpected

-- | From RFC 1035: <let-dig> ::= <letter> | <digit>
let_dig :: forall e. Parser e Char
let_dig = alphanum

-- | From RFC 1035: <let-dig-hyp> ::= <let-dig> | "-"
-- | Either a Letter, Digital or an Hyphenation character.
let_dig_hyp :: forall e. Parser e Char
let_dig_hyp = let_dig <|> char '-'

-- | From RFC 1035: <ldh-str> ::= <let-dig-hyp> | <let-dig-hyp> <ldh-str>
ldh_str :: forall e. Parser e (Array Char)
ldh_str = many1 let_dig_hyp

-- TODO: 63
label_maxsize :: Int
label_maxsize = 7

-- TODO: 255?
max_domain_length :: Int
max_domain_length = 15

last_char :: String -> Maybe Char
last_char = A.last <<< CU.toCharArray

parse_last_char :: forall e. String -> Parser e Char -> Boolean
parse_last_char s p = case last_char s of
  Nothing -> false
  Just c -> case parse p { string: CU.singleton c, position: 0 } of
    Left _  -> false
    _       -> true

-- | `tryMaybe` provides a way to accept a faulty parser and
-- | just rewinds back to previous input state if an error occurs.
tryMaybe :: forall e a. Parser e a -> Parser e (Maybe a)
tryMaybe p = Parser p'
  where p' input = case parse p input of
                   Left _                   -> Right { suffix: input, result: Nothing }
                   Right { suffix, result } -> Right { suffix,        result: Just result }

-- | `try` provides a way to accept a faulty parser and
-- | just rewinds back to previous input state if a non-specific error occurs.
try :: forall e a. Parser e a -> Parser e (Maybe a)
try p = Parser p'
  where p' input = case parse p input of
                   Left { position, error } -> case error of
                     Nothing -> Right { suffix: input, result: Nothing }
                     _       -> Left { position, error }
                   Right { suffix, result } -> Right { suffix, result: Just result }

-- | From RFC 1035: <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
label :: Parser DomainError String
label = do
  pos <- current_position
  maybel <- tryMaybe letter
  case maybel of
    Nothing -> Parser \_ -> failError pos (Just InvalidCharacter)
    Just l -> do
      s <- tryMaybe ldh_str
      lastpos <- current_position
      let labelstr = CU.singleton l <> maybe "" (\v -> CU.fromCharArray v) s
      if (S.length labelstr > label_maxsize)
      then Parser \_ -> failError pos (Just <<< SubdomainTooLarge $ S.length labelstr)
      else if (S.length labelstr > 1 && not (parse_last_char labelstr let_dig))
           then Parser \_ -> failError (lastpos - 1) (Just InvalidCharacter)
           else pure labelstr

-- | From RFC 1035: <subdomain> ::= <label> | <subdomain> "." <label>
subdomain :: Parser DomainError String
subdomain = do
  -- First: read a label. This is bare minimum for a subdomain.
  lab <- label
  upperlabels <- try do
    _ <- char '.'
    sub <- defer \_ -> subdomain
    pure sub
  case upperlabels of
    Nothing -> pure lab
    Just l  -> pure $ lab <> "." <> l

-- | Test for the end-of-file (no more input).
-- | If not EOF the parser fails (Nothing), otherwise it provides an empty string.
eof :: Parser DomainError String
eof = Parser \input -> case S.length input.string of
  0 -> success input ""
  _ -> failError input.position (Just EOFExpected)

-- | Test for the domain to be a list of subdomains then an end-of-file.
-- | Said otherwise, the input must only contain a domain (with or without a final dot '.').
sub_eof :: Parser DomainError String
sub_eof = do
  sub <- subdomain
  maybe_final_point <- tryMaybe $ char '.'
  _ <- eof -- In case there is still some input, it fails.
  pos <- current_position
  let parsed_domain = did_we_parsed_the_final_point maybe_final_point sub
  if S.length parsed_domain > max_domain_length
  then Parser \_ -> failError pos (Just <<< DomainTooLarge $ S.length parsed_domain)
  else pure parsed_domain
  where
    did_we_parsed_the_final_point Nothing sub = sub
    did_we_parsed_the_final_point _       sub = sub <> "."

-- | From RFC 1035: <domain> ::= <subdomain> | " "
-- |
-- | Accepting an optional '.' at the end of the subdomain doesn't conform
-- | to the (prefered) syntax of a domain as described in RFC 1035.
-- | However, this last '.' character should be acceptable in most applications.
-- | In some cases, a fully qualified domain name (FQDN) such as `example.com.`
-- | has to be differenciated from a "relative" name (www).
domain :: Parser DomainError String
domain = (string " " *> eof) <|> sub_eof