Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 17 additions & 7 deletions src/Html/HtmlParser.fs
Original file line number Diff line number Diff line change
Expand Up @@ -339,8 +339,8 @@ module internal HtmlParser =
| "pre" | "code" -> true
| _ -> false

/// True when the tag currently being built is a raw-text element,
/// i.e. its name (trimmed, case-insensitively) is "script" or "style".
member x.IsScriptTag
    with get() =
        // Lower-case with the invariant culture: the culture-sensitive ToLower()
        // maps 'I' to a dotless 'ı' under e.g. tr-TR, so "SCRIPT" would never
        // match "script" on machines using such a culture.
        match x.CurrentTagName().Trim().ToLowerInvariant() with
        | "script" | "style" -> true
        | _ -> false
Expand Down Expand Up @@ -570,14 +570,17 @@ module internal HtmlParser =
| '>' when state.IsScriptTag -> state.Pop(); state.EmitTag(true);
| TextParser.Letter _ -> state.ConsTag(); scriptEndTagName state
| _ ->
state.Cons([|'<'; '/'|]);
state.Cons(state.CurrentTagName());
state.Cons([|'<'; '/'|]);
state.Cons(state.CurrentTagName());
(!state.CurrentTag).Clear()
script state
and charRef state =
    // Tokenizer state that accumulates characters until the reference
    // is terminated, then emits what has been collected.
    match state.Peek() with
    | ';' ->
        // The terminating ';' is consumed as part of the token before emitting.
        state.Cons()
        state.Emit()
    // '<' is left unconsumed and simply ends the token.
    // '\uffff' marks end of input: System.IO.TextReader.Read() returns -1
    // at end of stream, and -1 cast to char is \uffff. Emitting here keeps
    // the state from recursing forever once the input is exhausted.
    | '<' | '\uffff' -> state.Emit()
    | _ ->
        // Any other character is consumed and the state continues.
        state.Cons()
        charRef state
and tagOpen state =
match state.Peek() with
Expand Down Expand Up @@ -846,8 +849,15 @@ module internal HtmlParser =
| TagEnd name :: rest when name <> expectedTagEnd && (name <> (new String(expectedTagEnd.ToCharArray() |> Array.rev))) ->
// ignore this token if not the expected end tag (or its reverse, e.g. <li></il>)
parse' docType elements expectedTagEnd parentTagName rest
| TagEnd _ :: rest ->
| TagEnd _ :: rest ->
recursiveReturn (docType, rest, List.rev elements)
| Text a :: Text b :: rest ->
if a = "" && b = "" then
// ignore this token
parse' docType elements expectedTagEnd parentTagName rest
else
let t = HtmlText (a + b)
parse' docType (t :: elements) expectedTagEnd parentTagName rest
| Text cont :: rest ->
if cont = "" then
// ignore this token
Expand All @@ -863,7 +873,7 @@ module internal HtmlParser =
parse' docType (c :: elements) expectedTagEnd parentTagName rest
| EOF :: _ -> recursiveReturn (docType, [], List.rev elements)
| [] -> recursiveReturn (docType, [], List.rev elements)
let tokens = tokenise reader
let tokens = tokenise reader
let docType, _, elements = tokens |> parse' (new Stack<_>()) "" [] "" ""
if List.isEmpty elements then
failwith "Invalid HTML"
Expand Down
25 changes: 25 additions & 0 deletions tests/FSharp.Data.Tests/HtmlParser.fs
Original file line number Diff line number Diff line change
Expand Up @@ -882,3 +882,28 @@ let ``Can handle escaped characters in a string inside script tag`` content =
[],
[ HtmlNode.NewText content ]) ]
result |> should equal expected

[<Test>]
let ``Parsing non-html content doesn't cause an infinite loop - Github-1264``() =
    // Regression test for Github-1264: the input is plain text, not HTML.
    // It contains a bare '&' that is never followed by ';' or '<'
    // (presumably this is what used to keep the tokeniser spinning at end
    // of input — confirm against the issue). Parsing must terminate and
    // yield a document holding the whole input as a single text node.
    let plainText =
        """Steve Jobs steve@apple.com Education: - Master of Mathematics Honours Computer Science and Combinatorics &
Optimization. I
specialized in systems and real-time programming, programming language
implementation, and mathematical optimization.
Skills:
- Proficient in Rust, C++, Scheme, x86(_64) LaTeX,
(Postgre)SQL, Gurobi, AWS, Google Cloud Platform, .NET (Core), C#,
Python, low-level profiling and optimization on Linux and Windows.
- Can do things with Java, Haskell, Clojure,
Scala, AMPS, redis, OpenGL.
Instructional support assistant at the School,
September to January 2010.
- Started the Java project[3], a custom IDE for students in an
introductory computer science course."""

    let parsedDocument = HtmlDocument.Parse plainText
    let expectedDocument = HtmlDocument.New [ HtmlNode.NewText plainText ]
    parsedDocument |> should equal expectedDocument