1 files changed, 374 insertions, 0 deletions
diff --git a/src/java_tokens.cc b/src/java_tokens.cc
new file mode 100644
index 0000000..59748c1
--- /dev/null
+++ b/src/java_tokens.cc
@@ -0,0 +1,374 @@
+#include "java_tokens.hh"
+
+#include "errors.hh"
+#include "java_tokens_java-8.hh"
+#include "java_uescape.hh"
+#include "str.hh"
+#include "u8.hh"
+#include "uline.hh"
+
+#include <cassert>
+#include <charconv>
+#include <cstddef>
+#include <expected>
+#include <format>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <system_error>
+#include <utility>
+
+namespace java {
+
+namespace {
+
+template <typename MatchNext, typename MatchToken>
+class TokensImpl : public Tokens {
+ public:
+  TokensImpl(std::unique_ptr<io::Reader> reader,  // wrapped in the init list
+             std::unique_ptr<src::Errors> errors, TokensConfig config)
+      : reader_(u8::line::open(u8::java::open(std::move(reader)))),
+        errors_(std::move(errors)),  // owned; receives the tokenizer's reports
+        config_(config) {}  // the remaining members are default-initialized
+
+  // Returns the next token other than white space, fetching a new line from
+  // the reader when the current one is used up. Reader errors pass through.
+  std::expected<Token, io::ReadError> read() override {
+    while (true) {
+      while (line_.empty()) {
+        auto maybe_line = reader_->read();
+        if (!maybe_line.has_value())
+          return std::unexpected(maybe_line.error());
+        line_ = maybe_line.value();
+        location_.line = reader_->number();
+        location_.column = 0;
+      }
+      Token token;
+      token.loc = location_;
+      auto const match = match_next_(line_);
+      if (!match.has_value()) {
+        errors_->err(location_, std::format("Invalid token: {}", line_));
+        token.type = Token::Type::kError;
+        token.str = line_;
+        line_ = {};  // drop the bad text so that later calls make progress
+        return token;
+      }
+      token.str = line_.substr(0, match->second);
+      location_.column += u8len(token.str);
+      line_ = line_.substr(match->second);
+      switch (match->first) {
+        case MatchToken::kBinaryIntegerLiteral:
+          handle_int_literal(token, /* base */ 2);
+          break;
+        case MatchToken::kBooleanLiteral:
+          token.type = Token::Type::kLiteralBoolean;
+          token.int_value = token.str != "false";
+          break;
+        case MatchToken::kCharacterLiteral:
+          token.type = Token::Type::kLiteralCharacter;
+          if (token.str[1] == '\\') {
+            token.int_value =
+                unescape(token.str.substr(1, token.str.size() - 2)).first;
+          } else {
+            auto* ptr = reinterpret_cast<uint8_t const*>(token.str.data() + 1);
+            auto* end = ptr + token.str.size() - 2;
+            token.int_value = u8::read(ptr, end).value();
+          }
+          break;
+        case MatchToken::kDecimalFloatingPointLiteral:
+          handle_float_literal(token);
+          break;
+        case MatchToken::kDecimalIntegerLiteral:
+          handle_int_literal(token);
+          break;
+        case MatchToken::kEndOfLineComment:
+          token.type = Token::Type::kComment;
+          token.str = str::trim(token.str.substr(2));
+          break;
+        case MatchToken::kHexIntegerLiteral:
+          handle_int_literal(token, /* base */ 16);
+          break;
+        case MatchToken::kHexadecimalFloatingPointLiteral:
+          handle_float_literal(token, /* base */ 16);
+          break;
+        case MatchToken::kIdentifier:
+          token.type = Token::Type::kIdentifier;
+          break;
+        case MatchToken::kKeyword:
+          token.type = Token::Type::kKeyword;
+          break;
+        case MatchToken::kNullLiteral:
+          token.type = Token::Type::kLiteralNull;
+          break;
+        case MatchToken::kOctalIntegerLiteral:
+          handle_int_literal(token, /* base */ 8);
+          break;
+        case MatchToken::kOperator:
+          token.type = Token::Type::kOperator;
+          break;
+        case MatchToken::kSeparator:
+          token.type = Token::Type::kSeparator;
+          break;
+        case MatchToken::kStringLiteral:
+          token.type = Token::Type::kLiteralString;
+          token.str =
+              unescape_if_needed(token.str.substr(1, token.str.size() - 2));
+          break;
+        case MatchToken::kTraditionalComment: {
+          token.type = Token::Type::kComment;
+          size_t const end = token.str.size() - 2;  // where the final "*/" is
+          size_t s = 2;
+          while (s < end && token.str[s] == '*')  // "/**/" keeps an empty body
+            ++s;
+          token.str = str::trim(token.str.substr(s, end - s));
+          token.int_value = static_cast<int64_t>(s - 1);
+          // TODO: handle multiline
+          break;
+        }
+        case MatchToken::kWhiteSpace:
+          continue;
+      }
+      return token;
+    }
+  }
+
+ private:
+  // Fallback for digits std::from_chars rejected; `str` has neither prefix nor
+  // underscores. Java literals are two's complement and never carry a sign.
+  void handle_int_literal_error(Token& token, std::string_view str,
+                                std::errc err, int base) {
+    if (err == std::errc::result_out_of_range) {
+      uint64_t bits = 0;
+      auto const* const end = str.data() + str.size();
+      auto const ret = std::from_chars(str.data(), end, bits, base);
+      // Only a suffixed (long) literal may use 64 bits; base 10 stops at 2^63.
+      if (ret.ec == std::errc() && ret.ptr < end &&
+          (base != 10 || bits <= (uint64_t{1} << 63))) {
+        token.type = Token::Type::kLiteralLong;
+        token.int_value = static_cast<int64_t>(bits);
+        return;
+      }
+    }
+    errors_->err(token.loc,
+                 std::format("Invalid integer literal: {}", token.str));
+    token.type = Token::Type::kError;
+  }
+
+  // Converts the integer literal in token.str to token.int_value.
+  //
+  // `base` is the radix of the literal (2, 8, 10 or 16) and decides how many
+  // prefix characters are skipped. '_' separators are removed before parsing
+  // because std::from_chars stops at them.
+  //
+  // On return token.type is kLiteralLong if the digits are followed by 'l' or
+  // 'L', kLiteralInt if the value is acceptable for an int, and kError (after
+  // a report to errors_) otherwise. Digits std::from_chars rejects are passed
+  // on to handle_int_literal_error().
+  void handle_int_literal(Token& token, int base = 10) {
+    size_t prefix;
+    switch (base) {
+      case 16:  // 0x
+      case 2:   // 0b
+        prefix = 2;
+        break;
+      case 8:  // 0
+        prefix = 1;
+        break;
+      default:
+        prefix = 0;
+        break;
+    }
+
+    // Strip digit separators into a scratch copy, but only if there are any.
+    std::string_view digits = std::string_view(token.str).substr(prefix);
+    std::string stripped;
+    if (digits.find('_') != std::string_view::npos) {
+      stripped.reserve(digits.size());
+      for (char const c : digits) {
+        if (c != '_')
+          stripped.push_back(c);
+      }
+      digits = stripped;
+    }
+
+    auto const* const end = digits.data() + digits.size();
+    auto const ret = std::from_chars(digits.data(), end, token.int_value, base);
+    if (ret.ec != std::errc()) {
+      handle_int_literal_error(token, digits, ret.ec, base);
+      return;
+    }
+    // Whatever follows the digits is the type suffix.
+    if (ret.ptr < end && (*ret.ptr == 'l' || *ret.ptr == 'L')) {
+      token.type = Token::Type::kLiteralLong;
+      return;
+    }
+
+    // A decimal int may be at most 2^31 (that value is only meaningful as the
+    // operand of unary minus); the other bases describe a 32-bit pattern, so
+    // anything up to 0xffffffff is fine and wraps to a negative int below.
+    int64_t const limit =
+        base == 10 ? int64_t{1} + std::numeric_limits<int32_t>::max()
+                   : int64_t{std::numeric_limits<uint32_t>::max()};
+    if (std::cmp_greater(token.int_value, limit)) {
+      // Report at the start of the literal; location_ is already past it.
+      errors_->err(token.loc,
+                   std::format("Invalid integer literal: {}", token.str));
+      token.type = Token::Type::kError;
+      return;
+    }
+    token.type = Token::Type::kLiteralInt;
+    token.int_value = static_cast<int32_t>(token.int_value);
+  }
+
+  // Parses the floating point literal in token.str into token.float_value.
+  // `base` is 16 for hexadecimal literals ("0x..."); any other value means
+  // decimal. An f/F suffix gives kLiteralFloatingPoint, everything else
+  // kLiteralDoubleFloatingPoint; unparsable text is reported and gives kError.
+  void handle_float_literal(Token& token, int base = 10) {
+    bool const hex = base == 16;
+    // std::from_chars requires fmt to be a single chars_format enumerator (not
+    // a combination), and its hex format expects the digits without "0x".
+    auto const fmt = hex ? std::chars_format::hex : std::chars_format::general;
+    std::string_view digits = std::string_view(token.str).substr(hex ? 2 : 0);
+    // Java permits '_' between digits but std::from_chars would stop there.
+    std::string stripped;
+    if (digits.find('_') != std::string_view::npos) {
+      stripped.reserve(digits.size());
+      for (char const c : digits) {
+        if (c != '_')
+          stripped.push_back(c);
+      }
+      digits = stripped;
+    }
+    auto const* const end = digits.data() + digits.size();
+
+    std::from_chars_result ret;
+    if (digits.ends_with('f') || digits.ends_with('F')) {
+      // float and double round differently, so parse a float as a float.
+      float value = 0;  // stays defined even if from_chars fails
+      ret = std::from_chars(digits.data(), end, value, fmt);
+      token.type = Token::Type::kLiteralFloatingPoint;
+      token.float_value = value;
+    } else {
+      ret = std::from_chars(digits.data(), end, token.float_value, fmt);
+      token.type = Token::Type::kLiteralDoubleFloatingPoint;
+    }
+    if (ret.ec == std::errc())
+      return;
+
+    // Java allows 0 with just a suffix, std::from_chars does not.
+    if (token.str == "0f" || token.str == "0F" || token.str == "0d" ||
+        token.str == "0D") {
+      token.float_value = 0;  // token.type was already chosen from the suffix
+      return;
+    }
+    errors_->err(token.loc,
+                 std::format("Invalid float literal: {}", token.str));
+    token.type = Token::Type::kError;
+  }
+
+  // Returns `str` with each backslash escape replaced by the bytes u8::write
+  // emits for its value. Text without a backslash is returned as is; otherwise
+  // the result lives in unescape_tmp_ and is overwritten by a later such call.
+  std::string_view unescape_if_needed(std::string_view str) {
+    size_t escape = str.find('\\');
+    if (escape == std::string_view::npos)
+      return str;
+    unescape_tmp_.clear();
+    unescape_tmp_.reserve(str.size());
+    size_t copied = 0;  // everything before this offset is already handled
+    do {
+      unescape_tmp_.append(str, copied, escape - copied);
+      auto [code, length] = unescape(str.substr(escape));
+      uint8_t encoded[4];
+      auto* out = encoded;
+      u8::write(out, encoded + sizeof(encoded), code);
+      unescape_tmp_.append(reinterpret_cast<char*>(encoded),
+                           static_cast<size_t>(out - encoded));
+      copied = escape + length;
+      escape = str.find('\\', copied);
+    } while (escape != std::string_view::npos);
+    unescape_tmp_.append(str, copied);
+    return unescape_tmp_;
+  }
+
+  // Decodes the escape sequence at the start of `in`, which may continue past
+  // it, and returns the value plus the number of characters consumed. Strings
+  // coming here have already been validated by the tokenizer.
+  static std::pair<uint16_t, size_t> unescape(std::string_view in) {
+    assert(in.size() > 1);
+    assert(in.front() == '\\');
+    if (in[1] >= '0' && in[1] <= '7') {
+      // An octal escape is at most three digits long, or two when the first
+      // digit is above 3; later digits are ordinary text (JLS 3.10.6).
+      size_t length = in[1] <= '3' ? 4 : 3;  // counts the backslash as well
+      if (length > in.size())
+        length = in.size();
+      uint8_t value = 0;
+      auto const ret =
+          std::from_chars(in.data() + 1, in.data() + length, value, 8);
+      return std::make_pair(value, ret.ptr - in.data());
+    }
+    switch (in[1]) {
+      case 'b':
+        return std::make_pair(8, 2);
+      case 't':
+        return std::make_pair(9, 2);
+      case 'n':
+        return std::make_pair(10, 2);
+      case 'f':
+        return std::make_pair(12, 2);
+      case 'r':
+        return std::make_pair(13, 2);
+      case '"':
+        return std::make_pair(34, 2);
+      case '\'':
+        return std::make_pair(39, 2);
+      case '\\':
+        return std::make_pair(92, 2);
+      default:
+        std::unreachable();
+    }
+  }
+
+  // Counts how often u8::skip succeeds on `str`; read() adds it to the column.
+  static size_t u8len(std::string_view str) {
+    auto const* cursor = reinterpret_cast<uint8_t const*>(str.data());
+    auto const* const limit = cursor + str.size();
+    size_t skipped = 0;
+    for (; u8::skip(cursor, limit); ++skipped) {}
+    return skipped;
+  }
+
+  std::unique_ptr<u8::line::Reader> reader_;  // source of lines for read()
+  std::unique_ptr<src::Errors> errors_;       // sink for tokenizer reports
+  TokensConfig const config_;  // NOTE(review): not read anywhere in this class
+  MatchNext match_next_;       // callable matching one token at line start
+  std::string_view line_;      // unconsumed remainder of the current line
+  Location location_;          // line/column where the next token starts
+  std::string unescape_tmp_;   // storage behind unescaped string literals
+};
+
+struct MatchNextJava8 {  // wraps java_8::matchNext as a callable type
+  using Result = std::optional<std::pair<java_8::Token, size_t>>;
+  Result operator()(std::string_view str) const {
+    return java_8::matchNext(str);
+  }
+};
+
+}  // namespace
+
+auto open(std::unique_ptr<io::Reader> reader,
+          std::unique_ptr<src::Errors> errors, TokensConfig config)
+    -> std::unique_ptr<Tokens> {
+  switch (config.version) {  // one case per Version with a tokenizer
+    case Version::kJava8:
+      return std::make_unique<TokensImpl<MatchNextJava8, java_8::Token>>(
+          std::move(reader), std::move(errors), config);
+  }
+  std::unreachable();  // config.version matched none of the cases above
+}
+
+}  // namespace java