diff options
Diffstat (limited to 'src/java_tokens.cc')
| -rw-r--r-- | src/java_tokens.cc | 374 |
1 files changed, 374 insertions, 0 deletions
#include "java_tokens.hh"

#include "errors.hh"
#include "java_tokens_java-8.hh"
#include "java_uescape.hh"
#include "str.hh"
#include "u8.hh"
#include "uline.hh"

#include <cassert>
#include <charconv>
#include <cstddef>
#include <cstdint>
#include <expected>
#include <format>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <system_error>
#include <utility>

namespace java {

namespace {

// Line based Java tokenizer.
//
// MatchNext is a functor that, given the rest of the current line, returns
// the kind (a MatchToken) and byte length of the token at the start of it.
// TokensImpl turns each match into a Token, decoding literal values on the way.
//
// Token::str is a view into either the current line or unescape_tmp_.
// NOTE(review): that presumably makes it valid only until the next read();
// confirm against the contract documented in java_tokens.hh.
template <typename MatchNext, typename MatchToken>
class TokensImpl : public Tokens {
 public:
  TokensImpl(std::unique_ptr<io::Reader> reader,
             std::unique_ptr<src::Errors> errors, TokensConfig config)
      : reader_(u8::line::open(u8::java::open(std::move(reader)))),
        errors_(std::move(errors)),
        config_(config) {}

  // Returns the next token that is not white space.
  //
  // Empty lines are skipped. When the underlying reader fails to produce a
  // line, its error is returned. Text that does not match any token is
  // reported to errors_ and returned as a single kError token covering the
  // rest of the line.
  std::expected<Token, io::ReadError> read() override {
    while (true) {
      while (line_.empty()) {
        auto maybe_line = reader_->read();
        if (!maybe_line.has_value())
          return std::unexpected(maybe_line.error());
        line_ = maybe_line.value();
        location_.line = reader_->number();
        location_.column = 0;
      }

      Token token;
      token.loc = location_;

      auto maybe_token_pair = match_next_(line_);
      if (!maybe_token_pair.has_value()) {
        errors_->err(token.loc, std::format("Invalid token: {}", line_));
        token.type = Token::Type::kError;
        token.str = line_;
        // Consume the offending text, otherwise every following call would
        // return this very same error and never make progress.
        line_ = line_.substr(line_.size());
        return token;
      }

      token.str = line_.substr(0, maybe_token_pair->second);
      // From here on location_ points past the token, token.loc at its start.
      location_.column += u8len(token.str);
      line_ = line_.substr(maybe_token_pair->second);
      switch (maybe_token_pair->first) {
        case MatchToken::kBinaryIntegerLiteral:
          handle_int_literal(token, /* base */ 2);
          break;
        case MatchToken::kBooleanLiteral:
          token.type = Token::Type::kLiteralBoolean;
          token.int_value = token.str != "false";
          break;
        case MatchToken::kCharacterLiteral:
          token.type = Token::Type::kLiteralCharacter;
          // token.str includes the surrounding single quotes.
          if (token.str[1] == '\\') {
            token.int_value =
                unescape(token.str.substr(1, token.str.size() - 2)).first;
          } else {
            auto* ptr =
                reinterpret_cast<uint8_t const*>(token.str.data() + 1);
            auto* end = ptr + token.str.size() - 2;
            token.int_value = u8::read(ptr, end).value();
          }
          break;
        case MatchToken::kDecimalFloatingPointLiteral:
          handle_float_literal(token);
          break;
        case MatchToken::kDecimalIntegerLiteral:
          handle_int_literal(token);
          break;
        case MatchToken::kEndOfLineComment:
          token.type = Token::Type::kComment;
          // Drop the leading "//".
          token.str = str::trim(token.str.substr(2));
          break;
        case MatchToken::kHexIntegerLiteral:
          handle_int_literal(token, /* base */ 16);
          break;
        case MatchToken::kHexadecimalFloatingPointLiteral:
          handle_float_literal(token, /* base */ 16);
          break;
        case MatchToken::kIdentifier:
          token.type = Token::Type::kIdentifier;
          break;
        case MatchToken::kKeyword:
          token.type = Token::Type::kKeyword;
          break;
        case MatchToken::kNullLiteral:
          token.type = Token::Type::kLiteralNull;
          break;
        case MatchToken::kOctalIntegerLiteral:
          handle_int_literal(token, /* base */ 8);
          break;
        case MatchToken::kOperator:
          token.type = Token::Type::kOperator;
          break;
        case MatchToken::kSeparator:
          token.type = Token::Type::kSeparator;
          break;
        case MatchToken::kStringLiteral:
          token.type = Token::Type::kLiteralString;
          // Drop the surrounding double quotes before unescaping.
          token.str =
              unescape_if_needed(token.str.substr(1, token.str.size() - 2));
          break;
        case MatchToken::kTraditionalComment: {
          token.type = Token::Type::kComment;
          // The token is at least "/**/". Stop counting leading stars before
          // the closing "*/" so that "/**/" and "/***/" yield an empty text
          // instead of underflowing the length passed to substr().
          size_t const close = token.str.size() - 2;
          size_t s = 2;
          while (s < close && token.str[s] == '*')
            ++s;
          token.str = str::trim(token.str.substr(s, close - s));
          // Number of stars directly following the opening '/'.
          token.int_value = static_cast<int64_t>(s - 1);
          // TODO: handle multiline
          break;
        }
        case MatchToken::kWhiteSpace:
          continue;
      }
      return token;
    }
  }

 private:
  // Reports "Invalid <kind> literal: <token>" at the start of the token and
  // turns the token into an error token.
  void literal_error(Token& token, std::string_view kind) {
    errors_->err(token.loc,
                 std::format("Invalid {} literal: {}", kind, token.str));
    token.type = Token::Type::kError;
  }

  // Returns a copy of str with all '_' digit separators removed.
  static std::string without_underscores(std::string_view str) {
    std::string out(str);
    std::erase(out, '_');
    return out;
  }

  // Called when parsing an integer literal as a signed 64 bit value failed.
  //
  // str is the literal without base prefix and underscores but including any
  // type suffix, err is the error std::from_chars returned.
  void handle_int_literal_error(Token& token, std::string_view str,
                                std::errc err, int base) {
    if (err == std::errc::result_out_of_range) {
      // Java assumes two's complement (so 0xffff_ffff is -1) and also,
      // negative literals are read as positive (because the operator '-' is
      // a separate token). So retry as unsigned 64 bit.
      uint64_t tmp = 0;
      auto const* const end = str.data() + str.size();
      auto ret = std::from_chars(str.data(), end, tmp, base);
      bool const is_long =
          ret.ptr < end && (*ret.ptr == 'l' || *ret.ptr == 'L');
      // Largest decimal long literal allowed, valid only as the operand of
      // unary minus (Long.MIN_VALUE).
      constexpr uint64_t kMaxDecimalLong = uint64_t{1} << 63;
      // Getting here means the value needs more than 63 bits, so it can never
      // be an int literal; it is only valid with a long suffix. Non-decimal
      // long literals may use all 64 bits, decimal ones are limited to 2^63.
      if (ret.ec == std::errc() && is_long &&
          (base != 10 || tmp <= kMaxDecimalLong)) {
        token.type = Token::Type::kLiteralLong;
        token.int_value = static_cast<int64_t>(tmp);
        return;
      }
    }
    literal_error(token, "integer");
  }

  // Decodes an integer literal in the given base into token.int_value and
  // sets token.type to kLiteralInt, kLiteralLong or kError.
  void handle_int_literal(Token& token, int base = 10) {
    size_t prefix;
    switch (base) {
      case 16:  // 0x
      case 2:   // 0b
        prefix = 2;
        break;
      case 8:  // 0
        prefix = 1;
        break;
      default:
        prefix = 0;
        break;
    }

    // std::from_chars does not know about digit separators, strip them first.
    std::string_view digits = token.str.substr(prefix);
    std::string stripped;
    if (digits.find('_') != std::string_view::npos) {
      stripped = without_underscores(digits);
      digits = stripped;
    }

    auto const* const end = digits.data() + digits.size();
    auto ret = std::from_chars(digits.data(), end, token.int_value, base);
    if (ret.ec != std::errc()) {
      handle_int_literal_error(token, digits, ret.ec, base);
      return;
    }

    // Parsing stops at the first non-digit, which is the type suffix if any.
    bool const is_long =
        ret.ptr < end && (*ret.ptr == 'l' || *ret.ptr == 'L');
    if (is_long) {
      token.type = Token::Type::kLiteralLong;
      return;
    }

    // Decimal int literals may be at most 2^31 (for -2147483648).
    if (base == 10 &&
        token.int_value >
            static_cast<int64_t>(1) + std::numeric_limits<int32_t>::max()) {
      literal_error(token, "integer");
      return;
    }
    // Other bases may use all 32 bits.
    if (std::cmp_greater(token.int_value,
                         std::numeric_limits<uint32_t>::max())) {
      literal_error(token, "integer");
      return;
    }
    token.type = Token::Type::kLiteralInt;
    // Reinterpret the low 32 bits as two's complement.
    token.int_value = static_cast<int32_t>(token.int_value);
  }

  // Decodes a floating point literal (decimal, or hexadecimal when base is
  // 16) into token.float_value and sets token.type to kLiteralFloatingPoint,
  // kLiteralDoubleFloatingPoint or kError.
  void handle_float_literal(Token& token, int base = 10) {
    bool const hex = base == 16;
    // std::from_chars only accepts hex on its own, not combined with general,
    // and expects the 0x prefix to be stripped already.
    auto const fmt = hex ? std::chars_format::hex : std::chars_format::general;

    // std::from_chars does not know about digit separators, strip them first.
    std::string_view digits = token.str.substr(hex ? 2 : 0);
    std::string stripped;
    if (digits.find('_') != std::string_view::npos) {
      stripped = without_underscores(digits);
      digits = stripped;
    }
    auto const* const first = digits.data();
    auto const* const last = first + digits.size();

    std::from_chars_result ret;
    if (token.str.ends_with('f') || token.str.ends_with('F')) {
      // float and double do not parse exactly the same, so use a float
      // parser for float.
      float tmp = 0;
      ret = std::from_chars(first, last, tmp, fmt);
      token.type = Token::Type::kLiteralFloatingPoint;
      token.float_value = tmp;
    } else {
      ret = std::from_chars(first, last, token.float_value, fmt);
      token.type = Token::Type::kLiteralDoubleFloatingPoint;
    }

    if (ret.ec == std::errc())
      return;

    // Java allows 0 with just a suffix, std::from_chars does not.
    if (token.str == "0f" || token.str == "0F") {
      token.type = Token::Type::kLiteralFloatingPoint;
      token.float_value = 0;
      return;
    }
    if (token.str == "0d" || token.str == "0D") {
      token.type = Token::Type::kLiteralDoubleFloatingPoint;
      token.float_value = 0;
      return;
    }
    literal_error(token, "float");
  }

  // Returns str with all escape sequences replaced by the characters they
  // stand for. If str has no backslash it is returned as is, otherwise the
  // returned view points into unescape_tmp_ and is overwritten by the next
  // call that has to unescape something.
  std::string_view unescape_if_needed(std::string_view str) {
    auto back_slash = str.find('\\');
    if (back_slash == std::string_view::npos)
      return str;
    unescape_tmp_.clear();
    unescape_tmp_.reserve(str.size());
    size_t last = 0;
    uint8_t tmp[4];
    while (true) {
      // Copy the plain text before the escape, then the decoded escape.
      unescape_tmp_.append(str, last, back_slash - last);
      auto ret = unescape(str.substr(back_slash));
      auto* ptr = tmp;
      u8::write(ptr, tmp + sizeof(tmp), ret.first);
      unescape_tmp_.append(
          std::string_view(reinterpret_cast<char*>(tmp), ptr - tmp));
      last = back_slash + ret.second;
      back_slash = str.find('\\', last);
      if (back_slash == std::string_view::npos) {
        unescape_tmp_.append(str, last);
        break;
      }
    }
    return unescape_tmp_;
  }

  // Decodes the escape sequence at the start of in, which may be followed by
  // more text. Returns the character value and the number of bytes the
  // escape sequence occupies (including the backslash).
  //
  // Strings coming here have already been validated by the tokenizer
  static std::pair<uint16_t, size_t> unescape(std::string_view in) {
    assert(in.front() == '\\');
    assert(in.size() > 1);
    switch (in[1]) {
      case 'b':
        return std::make_pair(8, 2);
      case 't':
        return std::make_pair(9, 2);
      case 'n':
        return std::make_pair(10, 2);
      case 'f':
        return std::make_pair(12, 2);
      case 'r':
        return std::make_pair(13, 2);
      case '"':
        return std::make_pair(34, 2);
      case '\'':
        return std::make_pair(39, 2);
      case '\\':
        return std::make_pair(92, 2);
      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7': {
        // An octal escape has at most three digits, and only two if the
        // first digit is 4-7. Any digits after that are ordinary characters,
        // so do not let std::from_chars swallow them.
        size_t const max_digits = in[1] <= '3' ? 3 : 2;
        size_t const limit =
            in.size() < 1 + max_digits ? in.size() : 1 + max_digits;
        uint8_t tmp = 0;
        auto ret = std::from_chars(in.data() + 1, in.data() + limit, tmp,
                                   /* base */ 8);
        return std::pair<uint16_t, size_t>(
            tmp, static_cast<size_t>(ret.ptr - in.data()));
      }
      default:
        std::unreachable();
    }
  }

  // Returns the number of times u8::skip() can advance through str.
  // NOTE(review): presumably the number of UTF-8 code points; confirm
  // against u8.hh.
  static size_t u8len(std::string_view str) {
    auto* ptr = reinterpret_cast<uint8_t const*>(str.data());
    auto* const end = ptr + str.size();
    size_t count = 0;
    while (u8::skip(ptr, end))
      ++count;
    return count;
  }

  std::unique_ptr<u8::line::Reader> reader_;
  std::unique_ptr<src::Errors> errors_;
  TokensConfig const config_;
  MatchNext match_next_;
  // Unread rest of the current line.
  std::string_view line_;
  // Location of the start of line_.
  Location location_;
  // Backing storage for string literals that had escape sequences.
  std::string unescape_tmp_;
};

// Adapter exposing java_8::matchNext as a MatchNext functor.
struct MatchNextJava8 {
  std::optional<std::pair<java_8::Token, size_t>> operator()(
      std::string_view str) const {
    return java_8::matchNext(str);
  }
};

}  // namespace

// Creates a tokenizer for the Java version selected by config.version,
// reading from reader and reporting problems to errors.
std::unique_ptr<Tokens> open(std::unique_ptr<io::Reader> reader,
                             std::unique_ptr<src::Errors> errors,
                             TokensConfig config) {
  switch (config.version) {
    case Version::kJava8:
      return std::make_unique<TokensImpl<MatchNextJava8, java_8::Token>>(
          std::move(reader), std::move(errors), config);
  }
  std::unreachable();
}

}  // namespace java
