diff --git a/src/Html/HtmlParser.fs b/src/Html/HtmlParser.fs
index 99e1d54ad..933145a11 100644
--- a/src/Html/HtmlParser.fs
+++ b/src/Html/HtmlParser.fs
@@ -339,8 +339,8 @@ module internal HtmlParser =
| "pre" | "code" -> true
| _ -> false
- member x.IsScriptTag
- with get() =
+ member x.IsScriptTag // true when CurrentTagName(), trimmed and lower-cased, is "script" or "style"
+ with get() =
match x.CurrentTagName().Trim().ToLower() with
| "script" | "style" -> true
| _ -> false
@@ -570,14 +570,17 @@ module internal HtmlParser =
| '>' when state.IsScriptTag -> state.Pop(); state.EmitTag(true);
| TextParser.Letter _ -> state.ConsTag(); scriptEndTagName state
| _ ->
- state.Cons([|'<'; '/'|]);
- state.Cons(state.CurrentTagName());
+ state.Cons([|'<'; '/'|]);
+ state.Cons(state.CurrentTagName());
(!state.CurrentTag).Clear()
script state
- and charRef state =
+ and charRef state = // recurses via state.Cons() until Peek() is ';', '<' or '\uffff', then calls state.Emit()
match state.Peek() with
| ';' -> state.Cons(); state.Emit()
| '<' -> state.Emit()
+ // End of input: System.IO.TextReader.Read() returns -1 at end of stream,
+ // and -1 cast to char is \uffff, so emit here instead of recursing through the catch-all case.
+ | '\uffff' -> state.Emit()
| _ -> state.Cons(); charRef state
and tagOpen state =
match state.Peek() with
@@ -846,8 +849,15 @@ module internal HtmlParser =
| TagEnd name :: rest when name <> expectedTagEnd && (name <> (new String(expectedTagEnd.ToCharArray() |> Array.rev))) ->
// ignore this token if it is neither the expected end tag nor its reverse (e.g. </il> where </li> is expected)
parse' docType elements expectedTagEnd parentTagName rest
- | TagEnd _ :: rest ->
+ | TagEnd _ :: rest ->
recursiveReturn (docType, rest, List.rev elements)
+ | Text a :: Text b :: rest ->
+ if a = "" && b = "" then
+ // ignore this token
+ parse' docType elements expectedTagEnd parentTagName rest
+ else
+ let t = HtmlText (a + b)
+ parse' docType (t :: elements) expectedTagEnd parentTagName rest
| Text cont :: rest ->
if cont = "" then
// ignore this token
@@ -863,7 +873,7 @@ module internal HtmlParser =
parse' docType (c :: elements) expectedTagEnd parentTagName rest
| EOF :: _ -> recursiveReturn (docType, [], List.rev elements)
| [] -> recursiveReturn (docType, [], List.rev elements)
- let tokens = tokenise reader
+ let tokens = tokenise reader
let docType, _, elements = tokens |> parse' (new Stack<_>()) "" [] "" ""
if List.isEmpty elements then
failwith "Invalid HTML"
diff --git a/tests/FSharp.Data.Tests/HtmlParser.fs b/tests/FSharp.Data.Tests/HtmlParser.fs
index fab19a0d6..7c7a37e92 100644
--- a/tests/FSharp.Data.Tests/HtmlParser.fs
+++ b/tests/FSharp.Data.Tests/HtmlParser.fs
@@ -882,3 +882,28 @@ let ``Can handle escaped characters in a string inside script tag`` content =
[],
[ HtmlNode.NewText content ]) ]
result |> should equal expected
+
+[<Test>]
+let ``Parsing non-html content doesn't cause an infinite loop - Github-1264``() =
+    let content =
+        """Steve Jobs steve@apple.com Education: - Master of Mathematics Honours Computer Science and Combinatorics &
+ Optimization. I
+ specialized in systems and real-time programming, programming language
+ implementation, and mathematical optimization.
+ Skills:
+ - Proficient in Rust, C++, Scheme, x86(_64) LaTeX,
+ (Postgre)SQL, Gurobi, AWS, Google Cloud Platform, .NET (Core), C#,
+ Python, low-level profiling and optimization on Linux and Windows.
+ - Can do things with Java, Haskell, Clojure,
+ Scala, AMPS, redis, OpenGL.
+ Instructional support assistant at the School,
+ September to January 2010.
+ - Started the Java project[3], a custom IDE for students in an
+ introductory computer science course."""
+    // Regression check: plain text with a bare '&' must finish parsing and yield one text node equal to the input.
+    let result = HtmlDocument.Parse content
+    let expected =
+        HtmlDocument.New [
+            HtmlNode.NewText content
+        ]
+    result |> should equal expected