summaryrefslogtreecommitdiff
path: root/src/java_tokens.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/java_tokens.cc')
-rw-r--r--src/java_tokens.cc374
1 files changed, 374 insertions, 0 deletions
diff --git a/src/java_tokens.cc b/src/java_tokens.cc
new file mode 100644
index 0000000..59748c1
--- /dev/null
+++ b/src/java_tokens.cc
@@ -0,0 +1,374 @@
+#include "java_tokens.hh"
+
+#include "errors.hh"
+#include "java_tokens_java-8.hh"
+#include "java_uescape.hh"
+#include "str.hh"
+#include "u8.hh"
+#include "uline.hh"
+
+#include <cassert>
+#include <charconv>
+#include <cstddef>
+#include <expected>
+#include <format>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <system_error>
+#include <utility>
+
+namespace java {
+
+namespace {
+
+template <typename MatchNext, typename MatchToken>
+class TokensImpl : public Tokens {
+ public:
+ TokensImpl(std::unique_ptr<io::Reader> reader,
+ std::unique_ptr<src::Errors> errors, TokensConfig config)
+ : reader_(u8::line::open(u8::java::open(std::move(reader)))),
+ errors_(std::move(errors)),
+ config_(config) {}
+
+ // Returns the next token of the input, or the line reader's error once no
+ // further line can be obtained.
+ //
+ // Whitespace tokens are skipped. Text that the matcher cannot recognise is
+ // reported through errors_ and returned as a kError token covering the rest
+ // of the current line; that text is then dropped so the following call
+ // continues with the next line instead of reporting the same text again.
+ //
+ // For string literals containing escapes, token.str points into
+ // unescape_tmp_ and is overwritten by the next such literal.
+ std::expected<Token, io::ReadError> read() override {
+   while (true) {
+     // Fetch lines until one has unconsumed text. A reader failure is handed
+     // straight to the caller.
+     while (line_.empty()) {
+       auto maybe_line = reader_->read();
+       if (!maybe_line.has_value())
+         return std::unexpected(maybe_line.error());
+       line_ = maybe_line.value();
+       location_.line = reader_->number();
+       location_.column = 0;
+     }
+
+     Token token;
+     token.loc = location_;
+
+     auto const maybe_token_pair = match_next_(line_);
+     if (!maybe_token_pair.has_value()) {
+       errors_->err(location_, std::format("Invalid token: {}", line_));
+       token.type = Token::Type::kError;
+       token.str = line_;
+       // Consume what was just reported. Leaving line_ untouched would make
+       // every later call fail on exactly the same text.
+       location_.column += u8len(line_);
+       line_.remove_prefix(line_.size());
+       return token;
+     }
+
+     // Split the matched prefix off the line; columns count UTF-8 sequences,
+     // not bytes.
+     size_t const length = maybe_token_pair->second;
+     token.str = line_.substr(0, length);
+     location_.column += u8len(token.str);
+     line_ = line_.substr(length);
+
+     switch (maybe_token_pair->first) {
+       case MatchToken::kBinaryIntegerLiteral:
+         handle_int_literal(token, /* base */ 2);
+         break;
+       case MatchToken::kBooleanLiteral:
+         token.type = Token::Type::kLiteralBoolean;
+         token.int_value = token.str != "false";
+         break;
+       case MatchToken::kCharacterLiteral:
+         token.type = Token::Type::kLiteralCharacter;
+         // token.str still includes both quotes; index 1 is the first
+         // character of the content.
+         if (token.str[1] == '\\') {
+           token.int_value =
+               unescape(token.str.substr(1, token.str.size() - 2)).first;
+         } else {
+           auto* ptr =
+               reinterpret_cast<uint8_t const*>(token.str.data() + 1);
+           auto* end = ptr + token.str.size() - 2;
+           token.int_value = u8::read(ptr, end).value();
+         }
+         break;
+       case MatchToken::kDecimalFloatingPointLiteral:
+         handle_float_literal(token);
+         break;
+       case MatchToken::kDecimalIntegerLiteral:
+         handle_int_literal(token);
+         break;
+       case MatchToken::kEndOfLineComment:
+         token.type = Token::Type::kComment;
+         // Drop the leading "//".
+         token.str = str::trim(token.str.substr(2));
+         break;
+       case MatchToken::kHexIntegerLiteral:
+         handle_int_literal(token, /* base */ 16);
+         break;
+       case MatchToken::kHexadecimalFloatingPointLiteral:
+         handle_float_literal(token, /* base */ 16);
+         break;
+       case MatchToken::kIdentifier:
+         token.type = Token::Type::kIdentifier;
+         break;
+       case MatchToken::kKeyword:
+         token.type = Token::Type::kKeyword;
+         break;
+       case MatchToken::kNullLiteral:
+         token.type = Token::Type::kLiteralNull;
+         break;
+       case MatchToken::kOctalIntegerLiteral:
+         handle_int_literal(token, /* base */ 8);
+         break;
+       case MatchToken::kOperator:
+         token.type = Token::Type::kOperator;
+         break;
+       case MatchToken::kSeparator:
+         token.type = Token::Type::kSeparator;
+         break;
+       case MatchToken::kStringLiteral:
+         token.type = Token::Type::kLiteralString;
+         // Strip the surrounding quotes, then resolve escapes if any.
+         token.str =
+             unescape_if_needed(token.str.substr(1, token.str.size() - 2));
+         break;
+       case MatchToken::kTraditionalComment: {
+         token.type = Token::Type::kComment;
+         // The content sits between the opening "/*" and the closing "*/".
+         // Extra '*' directly after the opener are counted but must not eat
+         // the '*' of the terminator: for "/**/" that used to underflow the
+         // length below and leave "/" as the comment text.
+         size_t const content_end = token.str.size() - 2;
+         size_t s = 2;
+         while (s < content_end && token.str[s] == '*')
+           ++s;
+         token.str = str::trim(token.str.substr(s, content_end - s));
+         // Number of '*' that follow the opening '/'.
+         token.int_value = static_cast<int64_t>(s - 1);
+         // TODO: handle multiline
+         break;
+       }
+       case MatchToken::kWhiteSpace:
+         // Not reported to the caller; match the next token instead.
+         continue;
+     }
+     return token;
+   }
+ }
+
+ private:
+ // Fallback for an integer literal that std::from_chars() could not store in
+ // token.int_value.
+ //
+ // `str` is the literal without its radix prefix and without underscores (a
+ // trailing 'l'/'L' may still be present), `err` is the error from_chars()
+ // reported and `base` is the radix. When the only problem is signed
+ // overflow and the literal is a long whose bits fit in 64 bits, the token
+ // becomes a kLiteralLong; otherwise an error is reported at the start of
+ // the token and the token becomes kError.
+ void handle_int_literal_error(Token& token, std::string_view str,
+                               std::errc err, int base) {
+   if (err == std::errc::result_out_of_range) {
+     // Java literals are two's complement bit patterns (0xffff_ffff is -1)
+     // and never carry a sign, because a leading '-' is a separate operator
+     // token. Retry as unsigned and reinterpret the bits.
+     uint64_t tmp = 0;
+     auto const* const end = str.data() + str.size();
+     auto const ret = std::from_chars(str.data(), end, tmp, base);
+     // Only a long literal may need more than the positive int64 range; an
+     // int literal this wide is invalid whatever its radix.
+     bool const has_long_suffix =
+         ret.ptr < end && (*ret.ptr == 'l' || *ret.ptr == 'L');
+     // A decimal long may be at most 2^63 (the operand of unary minus);
+     // the other radixes may use all 64 bits.
+     bool const in_range = base != 10 || tmp <= (uint64_t{1} << 63);
+     if (ret.ec == std::errc() && has_long_suffix && in_range) {
+       token.type = Token::Type::kLiteralLong;
+       token.int_value = static_cast<int64_t>(tmp);
+       return;
+     }
+   }
+   // token.loc is where the literal starts; location_ has already been moved
+   // past it by read().
+   errors_->err(token.loc,
+                std::format("Invalid integer literal: {}", token.str));
+   token.type = Token::Type::kError;
+ }
+
+ // Parses the integer literal held in token.str and fills in token.type and
+ // token.int_value.
+ //
+ // `base` is the radix (2, 8, 10 or 16) the tokenizer detected. Underscores
+ // between digits are ignored. A trailing 'l'/'L' makes the token a
+ // kLiteralLong; anything else is a kLiteralInt whose value is narrowed to 32
+ // bits. Values that do not fit are reported at the start of the token and
+ // turn it into kError.
+ void handle_int_literal(Token& token, int base = 10) {
+   // Characters of radix marker to skip: "0x"/"0b" or the leading "0" of an
+   // octal literal.
+   size_t const prefix = (base == 16 || base == 2) ? 2 : (base == 8 ? 1 : 0);
+
+   // Digits plus optional suffix. Only copy when underscores have to be
+   // removed; otherwise parse straight from the token text.
+   std::string_view digits = token.str.substr(prefix);
+   std::string stripped;
+   if (digits.find('_') != std::string_view::npos) {
+     stripped.reserve(digits.size());
+     for (char const c : digits) {
+       if (c != '_')
+         stripped.push_back(c);
+     }
+     digits = stripped;
+   }
+
+   auto const* const end = digits.data() + digits.size();
+   auto const ret =
+       std::from_chars(digits.data(), end, token.int_value, base);
+   if (ret.ec != std::errc()) {
+     handle_int_literal_error(token, digits, ret.ec, base);
+     return;
+   }
+
+   // from_chars() stops at the first non-digit, i.e. at the type suffix.
+   if (ret.ptr < end && (*ret.ptr == 'l' || *ret.ptr == 'L')) {
+     token.type = Token::Type::kLiteralLong;
+     return;
+   }
+
+   // A decimal int may be at most 2^31 (the operand of unary minus); the
+   // other radixes may use all 32 bits.
+   bool const too_large =
+       (base == 10 &&
+        token.int_value >
+            static_cast<int64_t>(1) + std::numeric_limits<int32_t>::max()) ||
+       std::cmp_greater(token.int_value,
+                        std::numeric_limits<uint32_t>::max());
+   if (too_large) {
+     // token.loc is where the literal starts; location_ has already been
+     // moved past it by read().
+     errors_->err(token.loc,
+                  std::format("Invalid integer literal: {}", token.str));
+     token.type = Token::Type::kError;
+     return;
+   }
+   token.type = Token::Type::kLiteralInt;
+   // Reinterpret as a 32-bit two's complement value (0xffffffff becomes -1).
+   token.int_value = static_cast<int32_t>(token.int_value);
+ }
+
+ // Parses the floating-point literal held in token.str and fills in
+ // token.type and token.float_value.
+ //
+ // `base` is 16 for hexadecimal literals and 10 otherwise. A trailing 'f'/'F'
+ // selects kLiteralFloatingPoint, anything else
+ // kLiteralDoubleFloatingPoint. Underscores between digits are ignored.
+ // Literals std::from_chars() rejects are reported at the start of the token
+ // and turn it into kError.
+ void handle_float_literal(Token& token, int base = 10) {
+   bool const is_hex = base == 16;
+   // chars_format::hex has to be passed on its own: combined with `general`
+   // it no longer equals the hex format value that selects hex parsing.
+   auto const fmt =
+       is_hex ? std::chars_format::hex : std::chars_format::general;
+
+   // Skip "0x" (from_chars() does not expect it) and drop underscores,
+   // which would otherwise end the parse early with a truncated value.
+   std::string_view body = token.str.substr(is_hex ? 2 : 0);
+   std::string stripped;
+   if (body.find('_') != std::string_view::npos) {
+     stripped.reserve(body.size());
+     for (char const c : body) {
+       if (c != '_')
+         stripped.push_back(c);
+     }
+     body = stripped;
+   }
+   auto const* const first = body.data();
+   auto const* const last = first + body.size();
+
+   std::from_chars_result ret;
+   if (token.str.ends_with("f") || token.str.ends_with("F")) {
+     // float and double do not parse exactly the same, so use a float parser
+     // for float.
+     float tmp = 0;
+     ret = std::from_chars(first, last, tmp, fmt);
+     token.type = Token::Type::kLiteralFloatingPoint;
+     token.float_value = tmp;
+   } else {
+     ret = std::from_chars(first, last, token.float_value, fmt);
+     token.type = Token::Type::kLiteralDoubleFloatingPoint;
+   }
+
+   if (ret.ec != std::errc()) {
+     // A bare zero followed only by a type suffix is valid Java; accept it
+     // explicitly in case from_chars() rejected it.
+     if (token.str == "0f" || token.str == "0F") {
+       token.type = Token::Type::kLiteralFloatingPoint;
+       token.float_value = 0;
+       return;
+     }
+     if (token.str == "0d" || token.str == "0D") {
+       token.type = Token::Type::kLiteralDoubleFloatingPoint;
+       token.float_value = 0;
+       return;
+     }
+     // token.loc is where the literal starts; location_ has already been
+     // moved past it by read().
+     errors_->err(token.loc,
+                  std::format("Invalid float literal: {}", token.str));
+     token.type = Token::Type::kError;
+   }
+ }
+
+ // Returns `str` with every backslash escape replaced by the UTF-8 encoding
+ // of the character it denotes.
+ //
+ // When `str` holds no backslash it is returned unchanged. Otherwise the
+ // result is assembled in unescape_tmp_ and the returned view refers to that
+ // member, so it is only valid until the next call that rewrites it.
+ std::string_view unescape_if_needed(std::string_view str) {
+   size_t escape_pos = str.find('\\');
+   if (escape_pos == std::string_view::npos)
+     return str;
+
+   unescape_tmp_.clear();
+   unescape_tmp_.reserve(str.size());
+
+   size_t copied = 0;  // everything before this index is already in the buffer
+   do {
+     // Plain text in front of the escape.
+     unescape_tmp_.append(str.substr(copied, escape_pos - copied));
+
+     // Decode the escape and append it as UTF-8.
+     auto [value, consumed] = unescape(str.substr(escape_pos));
+     uint8_t encoded[4];
+     auto* out = encoded;
+     u8::write(out, encoded + sizeof(encoded), value);
+     unescape_tmp_.append(reinterpret_cast<char const*>(encoded),
+                          static_cast<size_t>(out - encoded));
+
+     copied = escape_pos + consumed;
+     escape_pos = str.find('\\', copied);
+   } while (escape_pos != std::string_view::npos);
+
+   // Plain text after the last escape.
+   unescape_tmp_.append(str.substr(copied));
+   return unescape_tmp_;
+ }
+
+ // Decodes the escape sequence at the start of `in`.
+ //
+ // `in` must start with a backslash; anything after the escape is ignored.
+ // Returns the character value and the number of characters of `in` the
+ // escape occupies, backslash included.
+ //
+ // Strings coming here have already been validated by the tokenizer, so an
+ // unknown escape letter is treated as unreachable.
+ static std::pair<uint16_t, size_t> unescape(std::string_view in) {
+   assert(in.front() == '\\');
+   assert(in.size() > 1);
+   switch (in[1]) {
+     case 'b':
+       return {8, 2};
+     case 't':
+       return {9, 2};
+     case 'n':
+       return {10, 2};
+     case 'f':
+       return {12, 2};
+     case 'r':
+       return {13, 2};
+     case '"':
+       return {34, 2};
+     case '\'':
+       return {39, 2};
+     case '\\':
+       return {92, 2};
+     case '0':
+     case '1':
+     case '2':
+     case '3':
+     case '4':
+     case '5':
+     case '6':
+     case '7': {
+       // An octal escape has at most three digits, and only two when the
+       // first digit is 4-7. Further digits belong to the following text
+       // ("\0123" is '\012' followed by '3'), so they must not be consumed.
+       size_t const max_digits = in[1] <= '3' ? 3 : 2;
+       uint16_t value = 0;
+       size_t used = 1;  // the backslash
+       while (used <= max_digits && used < in.size() && in[used] >= '0' &&
+              in[used] <= '7') {
+         value = static_cast<uint16_t>(value * 8 + (in[used] - '0'));
+         ++used;
+       }
+       return {value, used};
+     }
+     default:
+       std::unreachable();
+   }
+ }
+
+ // Returns how many times u8::skip() can advance through `str` before it
+ // reports failure. read() uses the result as the number of columns a token
+ // occupies (presumably one per UTF-8 encoded character - confirm against
+ // u8::skip()).
+ static size_t u8len(std::string_view str) {
+   uint8_t const* cursor = reinterpret_cast<uint8_t const*>(str.data());
+   uint8_t const* const limit = cursor + str.size();
+   size_t units = 0;
+   for (; u8::skip(cursor, limit); ++units) {
+   }
+   return units;
+ }
+
+ std::unique_ptr<u8::line::Reader> reader_;  // Line source; read() asks it for a new line whenever line_ is empty.
+ std::unique_ptr<src::Errors> errors_;  // Receives the "Invalid ..." diagnostics for tokens and literals.
+ TokensConfig const config_;  // The configuration passed to the constructor.
+ MatchNext match_next_;  // Callable that matches one token at the start of line_.
+ std::string_view line_;  // Part of the current line that has not been turned into tokens yet.
+ Location location_;  // Line and column of the first character of line_.
+ std::string unescape_tmp_;  // Storage behind the view returned by unescape_if_needed().
+};
+
+// Function object that forwards to java_8::matchNext(), so the Java 8 matcher
+// can be supplied to TokensImpl as its MatchNext template argument.
+struct MatchNextJava8 {
+  // Returns whatever java_8::matchNext() returns for `str`; TokensImpl reads
+  // the pair as (token kind, length of the match).
+  auto operator()(std::string_view str) const
+      -> std::optional<std::pair<java_8::Token, size_t>> {
+    return java_8::matchNext(str);
+  }
+};
+
+} // namespace
+
+// Creates the tokenizer for the Java version selected by `config.version`.
+//
+// Ownership of `reader` and `errors` moves into the returned object.
+std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
+                             std::unique_ptr<src::Errors> errors,
+                             TokensConfig config) {
+  using Java8Tokens = TokensImpl<MatchNextJava8, java_8::Token>;
+
+  switch (config.version) {
+    case Version::kJava8:
+      return std::make_unique<Java8Tokens>(std::move(reader),
+                                           std::move(errors), config);
+  }
+  // Only reached when config.version holds a value without a case above.
+  std::unreachable();
+}
+
+} // namespace java