darcsweb - src/DarcsWeb/Clone.hs

[root] / src / DarcsWeb / Clone.hs
{-# LANGUAGE OverloadedStrings #-}

-- | Helpers for the read-only darcs HTTP clone endpoint.
module DarcsWeb.Clone
  ( buildCloneUrl
  , encodePathSegment
  ) where

import           Data.Bits (shiftR, (.&.))
import           Data.Char (isAsciiLower, isAsciiUpper, isDigit, ord)
import           Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.ByteString as BS
import           Data.Word (Word8)

-- | Construct the darcs-clone URL for a repository from the scheme and host
--   observed on the incoming request. Both arguments accept 'Nothing' (or
--   malformed / whitespace-only values) and fall back to a safe default so
--   the summary page cannot be crashed by crafted request headers.
--
--   Guarantees:
--
--   * the scheme is always either @http@ or @https@;
--   * the authority is always non-empty (defaults to @localhost@) and
--     only contains characters legal in a URI authority;
--   * the repository name is percent-encoded as a single path segment, so
--     spaces / reserved characters cannot break out of the path.
--
--   The trailing @/@ is required: @darcs clone@ treats the URL as a
--   directory and appends paths like @_darcs/hashed_inventory@.
buildCloneUrl
  :: Maybe Text  -- ^ value of the X-Forwarded-Proto header, if any
  -> Maybe Text  -- ^ value of the Host header, if any
  -> Text        -- ^ repository name (already validated upstream)
  -> Text
buildCloneUrl mProto mHost name =
    proto <> "://" <> host <> "/clone/" <> encodePathSegment name <> "/"
  where
    proto = maybe "http" normalizeProto mProto
    host  = case mHost >>= sanitizeHost of
              Just h  -> h
              Nothing -> "localhost"

-- | Only allow http/https from the proxy hint; anything else becomes http.
--   Tolerates the @header, header@ form that some proxies emit
--   (e.g. @X-Forwarded-Proto: https, http@) by inspecting only the first
--   comma-separated token.
normalizeProto :: Text -> Text
normalizeProto v =
    let v' = T.toLower (T.strip (firstToken v))
    in if v' == "https" then "https" else "http"

-- | Validate and return a cleaned @Host@ value, or 'Nothing' if it cannot
--   be safely rendered into a URL. Accepts one of:
--
--   * @reg-name[:port]@ — letters / digits / @.-_@, optional numeric port;
--   * @[IPv6-literal][:port]@ — bracketed IPv6 with optional numeric port.
--
--   Any other shape (bare @:@, unclosed bracket, multiple ports, extra
--   commas, etc.) falls through to 'Nothing' so the caller uses the
--   @localhost@ fallback.
sanitizeHost :: Text -> Maybe Text
sanitizeHost raw =
    let h = T.strip (firstToken raw)
    in if T.null h || not (validAuthority h) then Nothing else Just h

validAuthority :: Text -> Bool
validAuthority t
    | T.any (== ',') t = False
    | T.isPrefixOf "[" t = validBracketed t
    | otherwise          = validRegName t

-- Bracketed IPv6 literal with optional port:  @[...]@ or @[...]:port@
validBracketed :: Text -> Bool
validBracketed t =
    case T.breakOn "]" (T.drop 1 t) of
      (inner, rest)
        | T.null inner      -> False
        | not (T.all isIpv6Char inner) -> False
        | rest == "]"       -> True
        | Just port <- T.stripPrefix "]:" rest
                            -> validPort port
        | otherwise         -> False
  where
    isIpv6Char c =
      isAsciiLower c || isAsciiUpper c || isDigit c
      || c == ':' || c == '.'

-- Reg-name or IPv4 literal with optional port.
validRegName :: Text -> Bool
validRegName t =
    case T.splitOn ":" t of
      [h]        -> nameLike h
      [h, p]     -> nameLike h && validPort p
      _          -> False  -- multiple colons without IPv6 brackets
  where
    nameLike h =
      not (T.null h) && T.all nameChar h
    nameChar c =
      isAsciiLower c || isAsciiUpper c || isDigit c
      || c == '.' || c == '-' || c == '_'

validPort :: Text -> Bool
validPort p =
    not (T.null p) && T.length p <= 5 && T.all isDigit p

-- | First comma-separated token of a header value, e.g. \"https\" from
--   \"https, http\".
firstToken :: Text -> Text
firstToken = fst . T.break (== ',')

-- | A single path segment, percent-encoded so that spaces and reserved
--   characters (@/@, @?@, @#@, @\@, ...) cannot alter the URL structure.
--   Unreserved characters per RFC 3986 § 2.3 are passed through verbatim;
--   everything else is UTF-8 encoded and hex-escaped.
encodePathSegment :: Text -> Text
encodePathSegment t =
    T.concat [ encodeByte b | b <- BS.unpack (TE.encodeUtf8 t) ]
  where
    encodeByte :: Word8 -> Text
    encodeByte b
      | isUnreserved b = T.singleton (toEnum (fromIntegral b))
      | otherwise      = T.pack ['%', hex (b `shiftR` 4), hex (b .&. 0x0F)]

    isUnreserved b =
      let c = toEnum (fromIntegral b) :: Char
      in isAsciiLower c || isAsciiUpper c || isDigit c
         || c == '-' || c == '_' || c == '.' || c == '~'

    hex :: Word8 -> Char
    hex n | n < 10    = toEnum (ord '0' + fromIntegral n)
          | otherwise = toEnum (ord 'A' + fromIntegral n - 10)