11import 'dart:collection' ;
2+ import 'dart:convert' show ascii, utf8;
23
34import 'package:source_span/source_span.dart' ;
45
5- import 'char_encodings.dart' ;
66import 'constants.dart' ;
77import 'encoding_parser.dart' ;
88import 'utils.dart' ;
@@ -66,7 +66,7 @@ class HtmlInputStream {
6666 this .sourceUrl])
6767 : charEncodingName = codecName (encoding) {
6868 if (source is String ) {
69- _rawChars = toCodepoints ( source);
69+ _rawChars = source.runes. toList ( );
7070 charEncodingName = 'utf-8' ;
7171 charEncodingCertain = true ;
7272 } else if (source is List <int >) {
@@ -92,7 +92,7 @@ class HtmlInputStream {
9292 _chars = < int > [];
9393
9494 if (_rawChars == null ) {
95- _rawChars = decodeBytes (charEncodingName, _rawBytes);
95+ _rawChars = _decodeBytes (charEncodingName, _rawBytes);
9696 }
9797
9898 bool skipNewline = false ;
@@ -177,7 +177,7 @@ class HtmlInputStream {
177177 /// encoding otherwise return null.
178178 String detectBOM () {
179179 // Try detecting the BOM using bytes from the string
180- if (hasUtf8Bom (_rawBytes)) {
180+ if (_hasUtf8Bom (_rawBytes)) {
181181 return 'utf-8' ;
182182 }
183183 return null ;
@@ -292,3 +292,32 @@ String codecName(String encoding) {
292292 var canonicalName = encoding.replaceAll (asciiPunctuation, '' ).toLowerCase ();
293293 return encodings[canonicalName];
294294}
295+
/// Returns true if [bytes] starts with a UTF-8 byte order mark, i.e. the byte
/// sequence `EF BB BF`.
///
/// Since UTF-8 doesn't have byte order, "byte order mark" is somewhat of a
/// misnomer, but the marker is used in HTML to detect the UTF-8 encoding.
///
/// The check begins at [offset] and looks at no more than [length] bytes; when
/// [length] is omitted the range runs to the end of [bytes]. A range that
/// extends past the end of [bytes] is clamped, so an over-long [length] yields
/// `false` rather than a [RangeError].
bool _hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
  var end = length != null ? offset + length : bytes.length;
  // Never index past the end of the list, whatever range the caller asked for.
  if (end > bytes.length) end = bytes.length;
  return (offset + 3) <= end &&
      bytes[offset] == 0xEF &&
      bytes[offset + 1] == 0xBB &&
      bytes[offset + 2] == 0xBF;
}
306+
/// Decodes [bytes] with the provided [encoding] and returns an iterable of the
/// resulting code points.
///
/// Only the `'ascii'` and `'utf-8'` encodings are supported; any other
/// [encoding] throws an [ArgumentError].
///
/// Decoding is lenient: bytes that are not valid in the chosen encoding are
/// replaced with U+FFFD (REPLACEMENT CHARACTER) instead of raising a
/// [FormatException], so a malformed document can still be processed.
Iterable<int> _decodeBytes(String encoding, List<int> bytes) {
  switch (encoding) {
    case 'ascii':
      // Bytes above 0x7F are not ASCII; substitute U+FFFD rather than throw.
      return ascii.decode(bytes, allowInvalid: true).runes;

    case 'utf-8':
      // NOTE: A leading UTF-8 BOM is eaten here. This is the default behavior
      // of `utf8.decode`. Malformed sequences are replaced with U+FFFD.
      return utf8.decode(bytes, allowMalformed: true).runes;

    default:
      throw ArgumentError('Encoding $encoding not supported');
  }
}
0 commit comments