diff --git a/src/Html/HtmlParser.fs b/src/Html/HtmlParser.fs index 99e1d54ad..933145a11 100644 --- a/src/Html/HtmlParser.fs +++ b/src/Html/HtmlParser.fs @@ -339,8 +339,8 @@ module internal HtmlParser = | "pre" | "code" -> true | _ -> false - member x.IsScriptTag - with get() = + member x.IsScriptTag + with get() = match x.CurrentTagName().Trim().ToLower() with | "script" | "style" -> true | _ -> false @@ -570,14 +570,17 @@ module internal HtmlParser = | '>' when state.IsScriptTag -> state.Pop(); state.EmitTag(true); | TextParser.Letter _ -> state.ConsTag(); scriptEndTagName state | _ -> - state.Cons([|'<'; '/'|]); - state.Cons(state.CurrentTagName()); + state.Cons([|'<'; '/'|]); + state.Cons(state.CurrentTagName()); (!state.CurrentTag).Clear() script state - and charRef state = + and charRef state = match state.Peek() with | ';' -> state.Cons(); state.Emit() | '<' -> state.Emit() + // System.IO.TextReader.Read() returns -1 + // at end of stream, and -1 cast to char is \uffff. + | '\uffff' -> state.Emit() | _ -> state.Cons(); charRef state and tagOpen state = match state.Peek() with @@ -846,8 +849,15 @@ module internal HtmlParser = | TagEnd name :: rest when name <> expectedTagEnd && (name <> (new String(expectedTagEnd.ToCharArray() |> Array.rev))) -> // ignore this token if not the expected end tag (or it's reverse, eg:
  • ) parse' docType elements expectedTagEnd parentTagName rest - | TagEnd _ :: rest -> + | TagEnd _ :: rest -> recursiveReturn (docType, rest, List.rev elements) + | Text a :: Text b :: rest -> + if a = "" && b = "" then + // ignore this token + parse' docType elements expectedTagEnd parentTagName rest + else + let t = HtmlText (a + b) + parse' docType (t :: elements) expectedTagEnd parentTagName rest | Text cont :: rest -> if cont = "" then // ignore this token @@ -863,7 +873,7 @@ module internal HtmlParser = parse' docType (c :: elements) expectedTagEnd parentTagName rest | EOF :: _ -> recursiveReturn (docType, [], List.rev elements) | [] -> recursiveReturn (docType, [], List.rev elements) - let tokens = tokenise reader + let tokens = tokenise reader let docType, _, elements = tokens |> parse' (new Stack<_>()) "" [] "" "" if List.isEmpty elements then failwith "Invalid HTML" diff --git a/tests/FSharp.Data.Tests/HtmlParser.fs b/tests/FSharp.Data.Tests/HtmlParser.fs index fab19a0d6..7c7a37e92 100644 --- a/tests/FSharp.Data.Tests/HtmlParser.fs +++ b/tests/FSharp.Data.Tests/HtmlParser.fs @@ -882,3 +882,28 @@ let ``Can handle escaped characters in a string inside script tag`` content = [], [ HtmlNode.NewText content ]) ] result |> should equal expected + +[] +let ``Parsing non-html content doesn't cause an infinite loop - Github-1264``() = + let content = + """Steve Jobs steve@apple.com Education: - Master of Mathematics Honours Computer Science and Combinatorics & + Optimization. I + specialized in systems and real-time programming, programming language + implementation, and mathematical optimization. + Skills: + - Proficient in Rust, C++, Scheme, x86(_64) LaTeX, + (Postgre)SQL, Gurobi, AWS, Google Cloud Platform, .NET (Core), C#, + Python, low-level profiling and optimization on Linux and Windows. + - Can do things with Java, Haskell, Clojure, + Scala, AMPS, redis, OpenGL. + Instructional support assistant at the School, + September to January 2010. + - Started the Java project[3], a custom IDE for students in an + introductory computer science course.""" + + let result = HtmlDocument.Parse content + let expected = + HtmlDocument.New [ + HtmlNode.NewText content + ] + result |> should equal expected