#include "java_tokens.hh"

#include "errors.hh"
#include "java_tokens_java-8.hh"
#include "java_uescape.hh"
#include "str.hh"
#include "u8.hh"
#include "uline.hh"

// NOTE(review): this file reached review with every `<...>` span stripped
// (system include names, template argument lists, cast targets). Standard
// library types below were reconstructed from how the values are used. The
// project-declared names io::Reader, io::ReadError, Errors, u8::line::Reader,
// java8::TokenMatcher and java8::MatchToken are best guesses -- confirm them
// against java_tokens.hh, errors.hh, uline.hh and java_tokens_java-8.hh.
#include <cassert>
#include <charconv>
#include <cstdint>
#include <expected>
#include <format>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <system_error>
#include <utility>

namespace java {

namespace {

/// Tokens implementation driven by a version specific matcher.
///
/// Lines are pulled from the reader one at a time, the matcher classifies the
/// next token at the start of the current line and this class turns the match
/// into a Token (literal values, comment text, unescaped strings).
///
/// Token::str is a view into buffers owned by this object or by the reader, so
/// it is only meaningful until the next call to read().
template <typename TokenMatcher, typename MatchToken>
class TokensImpl : public Tokens {
 public:
  TokensImpl(std::unique_ptr<io::Reader> reader,
             std::unique_ptr<Errors> errors,
             TokensConfig config)
      : reader_(u8::line::open(u8::java::open(std::move(reader)))),
        errors_(std::move(errors)),
        config_(config) {}

  /// Returns the next non-whitespace token, or the reader's error once no
  /// further line can be read. Input that the matcher rejects is reported via
  /// errors_ and returned as a Token::Type::kError token.
  std::expected<Token, io::ReadError> read() override {
    while (true) {
      // Refill the current line; empty lines carry no tokens and are skipped.
      while (line_.empty()) {
        auto maybe_line = reader_->read();
        if (!maybe_line.has_value())
          return std::unexpected(maybe_line.error());
        line_ = maybe_line.value();
        location_.line = reader_->number();
        location_.column = 0;
      }

      Token token;
      token.loc = location_;
      auto maybe_token_pair = matcher_.matchNext(line_);
      while (matcher_.line_end_reached_) {
        // Matcher wants more lines.
        matcher_.line_end_reached_ = false;
        bool got_any = false;
        // Build the joined text in a fresh string: line_ may already be a
        // view into line_tmp_ (left over from an earlier multi-line token).
        std::string joined(line_);
        joined.push_back('\n');
        while (true) {
          auto maybe_line = reader_->read();
          if (!maybe_line.has_value())
            break;
          joined.append(maybe_line.value());
          got_any = true;
          // Simple check, it might not actually end the comment but if so
          // tokenizer will complain about reaching line_end again.
          if (maybe_line->contains("*/"))
            break;
          joined.push_back('\n');
        }
        line_tmp_ = std::move(joined);
        line_ = line_tmp_;
        maybe_token_pair = matcher_.matchNext(line_);
        if (!got_any)
          break;
      }

      if (!maybe_token_pair.has_value()) {
        errors_->err(location_, std::format("Invalid token: {}", line_));
        token.type = Token::Type::kError;
        token.str = line_;
        // The error token carries the rest of the input held in line_, so
        // consume it. Otherwise every following read() would report the very
        // same error again and never make progress.
        location_.column += u8len(line_);
        line_ = std::string_view();
        return token;
      }

      token.str = line_.substr(0, maybe_token_pair->second);
      line_ = line_.substr(maybe_token_pair->second);
      advance_location(token.str);

      switch (maybe_token_pair->first) {
        case MatchToken::kBinaryIntegerLiteral:
          handle_int_literal(token, /* base */ 2);
          break;
        case MatchToken::kBooleanLiteral:
          token.type = Token::Type::kLiteralBoolean;
          token.int_value = token.str != "false";
          break;
        case MatchToken::kCharacterLiteral:
          token.type = Token::Type::kLiteralCharacter;
          // token.str still includes the surrounding quotes.
          if (token.str[1] == '\\') {
            token.int_value =
                unescape(token.str.substr(1, token.str.size() - 2)).first;
          } else {
            auto* ptr = reinterpret_cast<uint8_t const*>(token.str.data() + 1);
            auto* end = ptr + token.str.size() - 2;
            token.int_value = u8::read(ptr, end).value();
          }
          break;
        case MatchToken::kDecimalFloatingPointLiteral:
          handle_float_literal(token);
          break;
        case MatchToken::kDecimalIntegerLiteral:
          handle_int_literal(token);
          break;
        case MatchToken::kEndOfLineComment:
          token.type = Token::Type::kComment;
          // Drop the leading "//".
          token.str = str::trim(token.str.substr(2));
          break;
        case MatchToken::kHexIntegerLiteral:
          handle_int_literal(token, /* base */ 16);
          break;
        case MatchToken::kHexadecimalFloatingPointLiteral:
          handle_float_literal(token, /* base */ 16);
          break;
        case MatchToken::kIdentifier:
          token.type = Token::Type::kIdentifier;
          break;
        case MatchToken::kKeyword:
          token.type = Token::Type::kKeyword;
          break;
        case MatchToken::kNullLiteral:
          token.type = Token::Type::kLiteralNull;
          break;
        case MatchToken::kOctalIntegerLiteral:
          handle_int_literal(token, /* base */ 8);
          break;
        case MatchToken::kOperator:
          token.type = Token::Type::kOperator;
          break;
        case MatchToken::kSeparator:
          token.type = Token::Type::kSeparator;
          break;
        case MatchToken::kStringLiteral:
          token.type = Token::Type::kLiteralString;
          // Strip the quotes, then resolve escape sequences if there are any.
          token.str =
              unescape_if_needed(token.str.substr(1, token.str.size() - 2));
          break;
        case MatchToken::kTraditionalComment: {
          token.type = Token::Type::kComment;
          // The comment text lives between the opening "/*" and the closing
          // "*/". Extra leading '*' (as in "/**") are skipped, but never into
          // the closing "*/": for "/**/" the old unbounded scan made
          // `size() - 2 - s` wrap around and yielded "/" as comment text.
          assert(token.str.size() >= 4);
          size_t const body_end = token.str.size() - 2;
          size_t s = 2;
          while (s < body_end && token.str[s] == '*')
            ++s;
          token.str = str::trim(token.str.substr(s, body_end - s));
          // Number of '*' that opened the comment (1 for "/*", 2 for "/**").
          token.int_value = static_cast<int64_t>(s - 1);
          break;
        }
        case MatchToken::kWhiteSpace:
          // Whitespace is never returned; go match the next token.
          continue;
      }
      return token;
    }
  }

 private:
  /// Moves location_ past `consumed`, the text of the token just taken off
  /// the front of line_.
  void advance_location(std::string_view consumed) {
    auto const last_newline = consumed.rfind('\n');
    if (last_newline == std::string_view::npos) {
      location_.column += u8len(consumed);
      return;
    }
    // A token only contains '\n' when read() joined several input lines, and
    // it then ends on the line that was read last. Continue counting from
    // there instead of adding the whole token to the first line's column.
    location_.line = reader_->number();
    location_.column = u8len(consumed.substr(last_newline + 1));
  }

  /// Returns `text` without '_' digit separators. `storage` backs the result
  /// when a copy is needed; without separators `text` is returned unchanged.
  static std::string_view strip_underscores(std::string_view text,
                                            std::string& storage) {
    if (text.find('_') == std::string_view::npos)
      return text;
    storage.clear();
    storage.reserve(text.size());
    for (char const c : text) {
      if (c != '_')
        storage.push_back(c);
    }
    return storage;
  }

  /// Second chance for integer literals that std::from_chars rejected.
  /// `str` is the literal without base prefix and without '_'.
  void handle_int_literal_error(Token& token, std::string_view str,
                                std::errc err, int base) {
    if (err == std::errc::result_out_of_range) {
      // Java assumes two's complement (so 0xffff_ffff is -1) and also,
      // negative literals are read as positive (because the operator '-' is a
      // separate token). So retry as unsigned 64 bit.
      uint64_t tmp = 0;
      auto const* const end = str.data() + str.size();
      auto const ret = std::from_chars(str.data(), end, tmp, base);
      // Only a long literal may need more than 63 bits; without the suffix
      // the literal is an int that is far out of range.
      bool const is_long =
          ret.ptr < end && (*ret.ptr == 'l' || *ret.ptr == 'L');
      // Decimal longs stop at 2^63 (valid as the operand of unary minus),
      // mirroring the 2^31 limit handle_int_literal applies to decimal ints.
      if (ret.ec == std::errc() && is_long &&
          (base != 10 || tmp <= (uint64_t{1} << 63))) {
        token.type = Token::Type::kLiteralLong;
        token.int_value = static_cast<int64_t>(tmp);
        return;
      }
    }
    errors_->err(location_,
                 std::format("Invalid integer literal: {}", token.str));
    token.type = Token::Type::kError;
  }

  /// Fills in type and int_value for an integer literal in the given base.
  void handle_int_literal(Token& token, int base = 10) {
    size_t prefix;
    switch (base) {
      case 16:  // 0x
      case 2:   // 0b
        prefix = 2;
        break;
      case 8:  // 0
        prefix = 1;
        break;
      default:
        prefix = 0;
        break;
    }

    std::string storage;
    std::string_view const digits =
        strip_underscores(token.str.substr(prefix), storage);
    auto const* const end = digits.data() + digits.size();
    auto const ret =
        std::from_chars(digits.data(), end, token.int_value, base);
    if (ret.ec != std::errc()) {
      handle_int_literal_error(token, digits, ret.ec, base);
      return;
    }

    // Whatever from_chars left unparsed is the type suffix, if any.
    if (ret.ptr < end && (*ret.ptr == 'l' || *ret.ptr == 'L')) {
      token.type = Token::Type::kLiteralLong;
      return;
    }

    // Decimal ints may be at most 2^31 (which is only valid when negated).
    if (base == 10 &&
        token.int_value >
            static_cast<int64_t>(1) + std::numeric_limits<int32_t>::max()) {
      errors_->err(location_,
                   std::format("Invalid integer literal: {}", token.str));
      token.type = Token::Type::kError;
      return;
    }
    // Other bases may use all 32 bits.
    if (std::cmp_greater(token.int_value,
                         std::numeric_limits<uint32_t>::max())) {
      errors_->err(location_,
                   std::format("Invalid integer literal: {}", token.str));
      token.type = Token::Type::kError;
      return;
    }
    token.type = Token::Type::kLiteralInt;
    // Wrap into the 32 bit two's complement value.
    token.int_value = static_cast<int32_t>(token.int_value);
  }

  /// Fills in type and float_value for a floating point literal.
  /// `base` is 16 for hexadecimal literals, 10 otherwise.
  void handle_float_literal(Token& token, int base = 10) {
    bool const is_hex = base == 16;
    // from_chars only parses hexadecimal when the format is exactly `hex`
    // (and then expects the "0x" prefix to be absent); OR-ing it with
    // `general` does not select hex parsing.
    auto const fmt =
        is_hex ? std::chars_format::hex : std::chars_format::general;
    size_t const prefix = is_hex ? 2 : 0;  // 0x

    // Java allows '_' between digits, from_chars would stop at the first one.
    std::string storage;
    std::string_view const digits =
        strip_underscores(token.str.substr(prefix), storage);
    auto const* const begin = digits.data();
    auto const* const end = begin + digits.size();

    std::from_chars_result ret;
    if (token.str.ends_with("f") || token.str.ends_with("F")) {
      // float and double do not parse exactly the same, so use a float parser
      // for float.
      float tmp = 0;
      ret = std::from_chars(begin, end, tmp, fmt);
      token.type = Token::Type::kLiteralFloatingPoint;
      token.float_value = tmp;
    } else {
      ret = std::from_chars(begin, end, token.float_value, fmt);
      token.type = Token::Type::kLiteralDoubleFloatingPoint;
    }
    if (ret.ec != std::errc()) {
      // Java allows 0 with just a suffix, std::from_chars does not.
      if (token.str == "0f" || token.str == "0F") {
        token.type = Token::Type::kLiteralFloatingPoint;
        token.float_value = 0;
        return;
      }
      if (token.str == "0d" || token.str == "0D") {
        token.type = Token::Type::kLiteralDoubleFloatingPoint;
        token.float_value = 0;
        return;
      }
      errors_->err(location_,
                   std::format("Invalid float literal: {}", token.str));
      token.type = Token::Type::kError;
    }
  }

  /// Resolves the escape sequences in the body of a string literal.
  /// Returns `str` itself when it holds no backslash, otherwise a view into
  /// unescape_tmp_, which is overwritten by the next call.
  std::string_view unescape_if_needed(std::string_view str) {
    auto back_slash = str.find('\\');
    if (back_slash == std::string_view::npos)
      return str;
    unescape_tmp_.clear();
    unescape_tmp_.reserve(str.size());
    size_t last = 0;
    uint8_t tmp[4];
    while (true) {
      // Copy the plain text in front of the escape, then the escaped value
      // as written by u8::write.
      unescape_tmp_.append(str, last, back_slash - last);
      auto ret = unescape(str.substr(back_slash));
      auto* ptr = tmp;
      u8::write(ptr, tmp + sizeof(tmp), ret.first);
      unescape_tmp_.append(
          std::string_view(reinterpret_cast<char*>(tmp), ptr - tmp));
      last = back_slash + ret.second;
      back_slash = str.find('\\', last);
      if (back_slash == std::string::npos) {
        unescape_tmp_.append(str, last);
        break;
      }
    }
    return unescape_tmp_;
  }

  /// Decodes the escape sequence at the start of `in`. Returns the value and
  /// the number of chars of `in` the sequence occupies, backslash included.
  // Strings coming here have already been validated by the tokenizer
  static std::pair<uint32_t, size_t> unescape(std::string_view in) {
    assert(in.front() == '\\');
    assert(in.size() > 1);
    switch (in[1]) {
      case 'b':
        return std::make_pair(8, 2);
      case 't':
        return std::make_pair(9, 2);
      case 'n':
        return std::make_pair(10, 2);
      case 'f':
        return std::make_pair(12, 2);
      case 'r':
        return std::make_pair(13, 2);
      case '"':
        return std::make_pair(34, 2);
      case '\'':
        return std::make_pair(39, 2);
      case '\\':
        return std::make_pair(92, 2);
      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7': {
        // An octal escape is at most three digits, and only two when the
        // first digit is 4-7 (the value has to fit in 8 bits). Digits after
        // that are ordinary text: "\1234" is '\123' followed by '4'.
        size_t const max_digits = in[1] <= '3' ? 3 : 2;
        uint32_t value = 0;
        size_t length = 1;  // the backslash
        while (length <= max_digits && length < in.size() &&
               in[length] >= '0' && in[length] <= '7') {
          value = value * 8 + static_cast<uint32_t>(in[length] - '0');
          ++length;
        }
        return std::make_pair(value, length);
      }
      default:
        std::unreachable();
    }
  }

  /// Counts how many times u8::skip can advance through `str`.
  // NOTE(review): presumably one step per UTF-8 encoded code point -- confirm
  // against u8.hh.
  static size_t u8len(std::string_view str) {
    auto* ptr = reinterpret_cast<uint8_t const*>(str.data());
    auto* const end = ptr + str.size();
    size_t count = 0;
    while (u8::skip(ptr, end))
      ++count;
    return count;
  }

  std::unique_ptr<u8::line::Reader> reader_;
  std::unique_ptr<Errors> errors_;
  TokensConfig const config_;
  TokenMatcher matcher_;
  // Input not yet tokenized; views either the reader's line or line_tmp_.
  std::string_view line_;
  // Backing store for tokens that span more than one input line.
  std::string line_tmp_;
  // Position of the first char of line_.
  Location location_;
  // Backing store for the most recently unescaped string literal.
  std::string unescape_tmp_;
};

}  // namespace

/// Creates the tokenizer for the Java version selected by `config`.
std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
                             std::unique_ptr<Errors> errors,
                             TokensConfig config) {
  switch (config.version) {
    case Version::kJava8:
      return std::make_unique<
          TokensImpl<java8::TokenMatcher, java8::MatchToken>>(
          std::move(reader), std::move(errors), config);
  }
  std::unreachable();
}

}  // namespace java