summaryrefslogtreecommitdiff
path: root/sax/src/decoder.cc
diff options
context:
space:
mode:
Diffstat (limited to 'sax/src/decoder.cc')
-rw-r--r--sax/src/decoder.cc308
1 files changed, 134 insertions, 174 deletions
diff --git a/sax/src/decoder.cc b/sax/src/decoder.cc
index 30b1735..35b9b46 100644
--- a/sax/src/decoder.cc
+++ b/sax/src/decoder.cc
@@ -12,273 +12,233 @@ namespace sax {
namespace {
-class UtfDecoder : public Decoder {
+class KnownEndianDecoder : public Decoder {
public:
- State decode(std::string_view in, std::size_t& in_offset,
- uint32_t* out, std::size_t out_size,
- std::size_t& out_offset) override {
- std::size_t const out_start = out_offset;
+ State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+ std::span<uint8_t> out, std::size_t& out_offset) override {
+ std::size_t tmp = in_offset;
+ uint32_t ret = read(in, tmp);
+ if (ret == utf::NEED_MORE)
+ return State::NEED_MORE;
+ if (ret == utf::INVALID)
+ return State::INVALID;
+
if (bom_ == -1) UNLIKELY {
- std::size_t tmp = in_offset;
- uint32_t ret = read(in, tmp);
- if (ret == utf::NEED_MORE) {
- return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return State::INVALID;
- }
if (ret == 0xfeff) {
// To allow offset to advance and to return, we need to
// read at least one more character completely.
ret = read(in, tmp);
- if (ret == utf::NEED_MORE) {
+ if (ret == utf::NEED_MORE)
return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
+ if (ret == utf::INVALID)
return State::INVALID;
- }
bom_ = 1;
} else {
bom_ = 0;
}
- in_offset = tmp;
- out[out_offset++] = ret;
- if (out_offset == out_size)
- return State::GOOD;
+ if (!utf::write8(ret, out, out_offset)) {
+ bom_ = -1;
+ return State::NEED_MORE;
+ }
+ } else {
+ if (!utf::write8(ret, out, out_offset))
+ return State::NEED_MORE;
}
+ in_offset = tmp;
- do {
- uint32_t ret = read(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
- return State::GOOD;
+ while (true) {
+ ret = read(in, tmp);
+ if (ret == utf::NEED_MORE || ret == utf::INVALID)
+ return State::GOOD;
+ if (!utf::write8(ret, out, out_offset))
+ return State::GOOD;
+ in_offset = tmp;
+ }
}
protected:
- UtfDecoder() = default;
+ KnownEndianDecoder() = default;
- virtual uint32_t read(std::string_view data, std::size_t& offset) const = 0;
+ virtual uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const = 0;
private:
int8_t bom_{-1};
};
-class Utf8Decoder : public UtfDecoder {
+class Utf8Decoder : public KnownEndianDecoder {
public:
Utf8Decoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read8(data, offset);
}
};
-class Utf16BeDecoder : public UtfDecoder {
+class Utf16BeDecoder : public KnownEndianDecoder {
public:
Utf16BeDecoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read16be(data, offset);
}
};
-class Utf16LeDecoder : public UtfDecoder {
+class Utf16LeDecoder : public KnownEndianDecoder {
public:
Utf16LeDecoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read16le(data, offset);
}
};
-class Utf32BeDecoder : public UtfDecoder {
+class Utf32BeDecoder : public KnownEndianDecoder {
public:
Utf32BeDecoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read32be(data, offset);
}
};
-class Utf32LeDecoder : public UtfDecoder {
+class Utf32LeDecoder : public KnownEndianDecoder {
public:
Utf32LeDecoder() = default;
- uint32_t read(std::string_view data, std::size_t& offset) const override {
+ uint32_t read(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
return utf::read32le(data, offset);
}
};
-class Utf16Decoder : public Decoder {
+class UnknownEndianDecoder : public Decoder {
public:
- Utf16Decoder() = default;
-
- State decode(std::string_view in, std::size_t& in_offset,
- uint32_t* out, std::size_t out_size,
- std::size_t& out_offset) override {
- std::size_t const out_start = out_offset;
+ State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+ std::span<uint8_t> out, std::size_t& out_offset) override {
+ std::size_t tmp = in_offset;
if (endian_ == -1) UNLIKELY {
- std::size_t tmp = in_offset;
- uint32_t ret = utf::read16be(in, tmp);
- int8_t endian;
- if (ret == utf::NEED_MORE) {
+ uint32_t ret = readbe(in, tmp);
+ if (ret == utf::NEED_MORE)
return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
+ if (ret == utf::INVALID)
return State::INVALID;
- }
if (ret == 0xfeff) {
- endian = 1; // Big endian
+ endian_ = 1;
} else if (ret == 0xfffe) {
- endian = 0; // Little endian
+ endian_ = 0;
} else {
return State::INVALID;
}
+ in_offset = tmp;
+ }
- // To allow offset to advance and to return, we need to
- // read at least one more character completely.
- ret = endian == 1 ? utf::read16be(in, tmp) : utf::read16le(in, tmp);
- if (ret == utf::NEED_MORE) {
+ if (endian_ == 0) {
+ uint32_t ret = readle(in, tmp);
+ if (ret == utf::NEED_MORE)
return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
+ if (ret == utf::INVALID)
return State::INVALID;
- }
+ if (!utf::write8(ret, out, out_offset))
+ return State::NEED_MORE;
+ in_offset = tmp;
- endian_ = endian;
+ while (true) {
+ ret = readle(in, tmp);
+ if (ret == utf::NEED_MORE || ret == utf::INVALID)
+ return State::GOOD;
+ if (!utf::write8(ret, out, out_offset))
+ return State::GOOD;
+ in_offset = tmp;
+ }
+ } else /* if (endian_ == 1) */ {
+ uint32_t ret = readbe(in, tmp);
+ if (ret == utf::NEED_MORE)
+ return State::NEED_MORE;
+ if (ret == utf::INVALID)
+ return State::INVALID;
+ if (!utf::write8(ret, out, out_offset))
+ return State::NEED_MORE;
in_offset = tmp;
- out[out_offset++] = ret;
- if (out_offset == out_size)
- return State::GOOD;
- }
- if (endian_ == 1) {
- do {
- uint32_t ret = utf::read16be(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
- } else {
- do {
- uint32_t ret = utf::read16le(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
+ while (true) {
+ ret = readbe(in, tmp);
+ if (ret == utf::NEED_MORE || ret == utf::INVALID)
+ return State::GOOD;
+ if (!utf::write8(ret, out, out_offset))
+ return State::GOOD;
+ in_offset = tmp;
+ }
}
- return State::GOOD;
}
+ protected:
+ UnknownEndianDecoder() = default;
+
+ virtual uint32_t readle(
+ std::span<uint8_t const> data, std::size_t& offset) const = 0;
+ virtual uint32_t readbe(
+ std::span<uint8_t const> data, std::size_t& offset) const = 0;
+
private:
int8_t endian_{-1};
};
-class Utf32Decoder : public Decoder {
+class Utf16Decoder : public UnknownEndianDecoder {
public:
- Utf32Decoder() = default;
+ Utf16Decoder() = default;
- State decode(std::string_view in, std::size_t& in_offset,
- uint32_t* out, std::size_t out_size,
- std::size_t& out_offset) override {
- std::size_t const out_start = out_offset;
- if (endian_ == -1) UNLIKELY {
- std::size_t tmp = in_offset;
- uint32_t ret = utf::read32be(in, tmp);
- int8_t endian;
- if (ret == utf::NEED_MORE) {
- return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- tmp = in_offset;
- ret = utf::read32le(in, tmp);
- if (ret == 0xfeff) {
- endian = 0; // Little endian
- } else {
- return State::INVALID;
- }
- } else if (ret == 0xfeff) {
- endian = 1; // Big endian
- } else {
- return State::INVALID;
- }
+ uint32_t readle(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
+ return utf::read16le(data, offset);
+ }
- // To allow offset to advance and to return, we need to
- // read the next character completely.
- ret = endian == 1 ? utf::read32be(in, tmp) : utf::read32le(in, tmp);
- if (ret == utf::NEED_MORE) {
- return State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return State::INVALID;
- }
+ uint32_t readbe(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
+ return utf::read16be(data, offset);
+ }
+};
- endian_ = endian;
- in_offset = tmp;
- out[out_offset++] = ret;
- if (out_offset == out_size)
- return State::GOOD;
- }
+class Utf32Decoder : public UnknownEndianDecoder {
+ public:
+ Utf32Decoder() = default;
- if (endian_ == 1) {
- do {
- uint32_t ret = utf::read32be(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
- } else {
- do {
- uint32_t ret = utf::read32le(in, in_offset);
- if (ret == utf::NEED_MORE) {
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- }
- if (ret == utf::INVALID) {
- return out_offset > out_start ? State::GOOD : State::INVALID;
- }
- out[out_offset++] = ret;
- } while (out_offset < out_size);
- }
- return State::GOOD;
+ uint32_t readle(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
+ return utf::read32le(data, offset);
}
- private:
- int8_t endian_{-1};
+ uint32_t readbe(
+ std::span<uint8_t const> data, std::size_t& offset) const override {
+ return utf::read32be(data, offset);
+ }
};
class AsciiDecoder : public Decoder {
public:
AsciiDecoder() = default;
- State decode(std::string_view in, std::size_t& in_offset,
- uint32_t* out, std::size_t out_size,
- std::size_t& out_offset) override {
- std::size_t const out_start = out_offset;
- do {
- if (in_offset == in.size())
- return out_offset > out_start ? State::GOOD : State::NEED_MORE;
- if (in[in_offset] & 0x80)
- return out_offset > out_start ? State::GOOD : State::INVALID;
- out[out_offset++] = in[in_offset++];
- } while (out_offset < out_size);
- return State::GOOD;
+ State decode(std::span<uint8_t const> in, std::size_t& in_offset,
+ std::span<uint8_t> out, std::size_t& out_offset) override {
+ if (in_offset >= in.size())
+ return State::NEED_MORE;
+ if (in[in_offset] & 0x80)
+ return State::INVALID;
+ if (!utf::write8(in[in_offset], out, out_offset))
+ return State::NEED_MORE;
+ ++in_offset;
+
+ while (true) {
+ if (in_offset >= in.size() || in[in_offset] & 0x80)
+ return State::GOOD;
+ if (!utf::write8(in[in_offset], out, out_offset))
+ return State::GOOD;
+ ++in_offset;
+ }
}
};