UTF8Decoder.js 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. import { decoderError } from "../../encoding/encodings";
  2. import { finished } from "../../encoding/finished";
  3. import { end_of_stream } from "../../encoding/terminology";
  4. import { inRange } from "../../encoding/utilities";
  /**
   * Incremental UTF-8 decoder: consumes one byte per `handler` call and
   * emits a code point once a complete, well-formed sequence has been seen.
   *
   * @constructor
   * @implements {Decoder}
   * @param {{fatal: boolean}} options `fatal` is forwarded unchanged to
   *     `decoderError` whenever malformed input is detected.
   */
  var UTF8Decoder = /** @class */ (function () {
    /**
     * Restores every piece of per-sequence state to its initial value.
     * Private to this closure; not part of the decoder's public surface.
     *
     * @param {UTF8Decoder} decoder The instance to reset.
     */
    function resetState(decoder) {
      decoder.utf8_code_point = 0;
      decoder.utf8_bytes_seen = 0;
      decoder.utf8_bytes_needed = 0;
      decoder.utf8_lower_boundary = 0x80;
      decoder.utf8_upper_boundary = 0xBF;
    }
    function UTF8Decoder(options) {
      this.fatal = options.fatal;
      // UTF-8's decoder has an associated utf-8 code point, utf-8
      // bytes seen, and utf-8 bytes needed (all initially 0), a utf-8
      // lower boundary (initially 0x80), and a utf-8 upper boundary
      // (initially 0xBF).
      /** @type {number} */ this.utf8_code_point = 0;
      /** @type {number} */ this.utf8_bytes_seen = 0;
      /** @type {number} */ this.utf8_bytes_needed = 0;
      /** @type {number} */ this.utf8_lower_boundary = 0x80;
      /** @type {number} */ this.utf8_upper_boundary = 0xBF;
    }
    /**
     * Processes a single byte (or the end-of-stream marker).
     *
     * @param {Stream} stream The stream of bytes being decoded. Only
     *     `stream.prepend(byte)` is used, to push back a byte that turned
     *     out not to belong to the sequence in progress.
     * @param {number} bite The next byte read from the stream, or
     *     `end_of_stream`.
     * @return {?(number|!Array.<number>)} The decoded code point, `null`
     *     when more bytes are needed, `finished` at a clean end of stream,
     *     or whatever `decoderError(this.fatal)` yields for malformed
     *     input (NOTE(review): presumably throws when fatal and returns a
     *     replacement code point otherwise - confirm in encodings module).
     */
    UTF8Decoder.prototype.handler = function (stream, bite) {
      if (bite === end_of_stream) {
        // 1. If byte is end-of-stream and utf-8 bytes needed is not 0,
        // return error. The whole sequence state is cleared (not just
        // utf-8 bytes needed) *before* signalling the error, so stale
        // bytes-seen / code-point / boundary values from the truncated
        // sequence cannot corrupt decoding if this instance is reused.
        if (this.utf8_bytes_needed !== 0) {
          resetState(this);
          return decoderError(this.fatal);
        }
        // 2. If byte is end-of-stream, return finished.
        return finished;
      }
      // 3. If utf-8 bytes needed is 0, based on byte:
      if (this.utf8_bytes_needed === 0) {
        // 0x00 to 0x7F: return a code point whose value is byte.
        if (inRange(bite, 0x00, 0x7F)) {
          return bite;
        }
        if (inRange(bite, 0xC2, 0xDF)) {
          // 0xC2 to 0xDF: lead byte of a two-byte sequence.
          this.utf8_bytes_needed = 1;
          this.utf8_code_point = bite & 0x1F;
        }
        else if (inRange(bite, 0xE0, 0xEF)) {
          // 0xE0 to 0xEF: lead byte of a three-byte sequence. The first
          // continuation byte is range-restricted after 0xE0 (lower
          // boundary 0xA0) and after 0xED (upper boundary 0x9F).
          if (bite === 0xE0) {
            this.utf8_lower_boundary = 0xA0;
          }
          if (bite === 0xED) {
            this.utf8_upper_boundary = 0x9F;
          }
          this.utf8_bytes_needed = 2;
          this.utf8_code_point = bite & 0xF;
        }
        else if (inRange(bite, 0xF0, 0xF4)) {
          // 0xF0 to 0xF4: lead byte of a four-byte sequence. The first
          // continuation byte is range-restricted after 0xF0 (lower
          // boundary 0x90) and after 0xF4 (upper boundary 0x8F).
          if (bite === 0xF0) {
            this.utf8_lower_boundary = 0x90;
          }
          if (bite === 0xF4) {
            this.utf8_upper_boundary = 0x8F;
          }
          this.utf8_bytes_needed = 3;
          this.utf8_code_point = bite & 0x7;
        }
        else {
          // Otherwise: not a valid lead byte, return error.
          return decoderError(this.fatal);
        }
        // Return continue.
        return null;
      }
      // 4. If byte is not in the range utf-8 lower boundary to utf-8
      // upper boundary, inclusive, run these substeps:
      if (!inRange(bite, this.utf8_lower_boundary, this.utf8_upper_boundary)) {
        // 1. Set utf-8 code point, utf-8 bytes needed, and utf-8
        // bytes seen to 0, set utf-8 lower boundary to 0x80, and set
        // utf-8 upper boundary to 0xBF.
        resetState(this);
        // 2. Prepend byte to stream so it is handled again as the
        // potential start of a new sequence.
        stream.prepend(bite);
        // 3. Return error.
        return decoderError(this.fatal);
      }
      // 5. Set utf-8 lower boundary to 0x80 and utf-8 upper boundary
      // to 0xBF (the special ranges apply to the first continuation
      // byte only).
      this.utf8_lower_boundary = 0x80;
      this.utf8_upper_boundary = 0xBF;
      // 6. Set UTF-8 code point to (UTF-8 code point << 6) | (byte &
      // 0x3F)
      this.utf8_code_point = (this.utf8_code_point << 6) | (bite & 0x3F);
      // 7. Increase utf-8 bytes seen by one.
      this.utf8_bytes_seen += 1;
      // 8. If utf-8 bytes seen is not equal to utf-8 bytes needed,
      // continue.
      if (this.utf8_bytes_seen !== this.utf8_bytes_needed) {
        return null;
      }
      // 9. Let code point be utf-8 code point.
      var code_point = this.utf8_code_point;
      // 10. Set utf-8 code point, utf-8 bytes needed, and utf-8 bytes
      // seen to 0.
      this.utf8_code_point = 0;
      this.utf8_bytes_needed = 0;
      this.utf8_bytes_seen = 0;
      // 11. Return a code point whose value is code point.
      return code_point;
    };
    return UTF8Decoder;
  }());
  125. export { UTF8Decoder };
  126. //# sourceMappingURL=UTF8Decoder.js.map