Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions libraries/from-bodyxml/go/README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
# XML to Content Tree Transformer

## Overview
The Transformer converts an external XHTML-formatted document into a content tree.
It supports the format stored in the **internalComponent** collection as well as the one returned by the **Internal Content API**.
The latter is produced by the content-public-read service after applying certain transformations to the bodyXML it retrieves from the internalComponents collection.
These transformations include renaming the content, related, and concept tags to ft-content, ft-related, and ft-concept, respectively, and replacing the id attribute with url, with a few caveats.
The Transformer converts an external XHTML-formatted document into a content tree. It supports the bodyXML format used in the main content store within the Content & Metadata platform — specifically, in the **internalComponent** collection.


## Usage

Expand All @@ -28,4 +26,9 @@ func main() {

fmt.Printf("Transformed content tree: %+v\n", out)
}
```
```

## Known Limitations and Behavior
The current implementation of the transformer has the following limitations:
- If the transformer encounters an HTML tag that does not have a corresponding definition in the content tree, that tag is skipped.
- If an HTML element contains child elements that are not allowed, those disallowed children are ignored.
27 changes: 0 additions & 27 deletions libraries/from-bodyxml/go/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,30 +75,3 @@ func valueOr(v, fallback string) string {
// attr looks up the attribute called name on el and returns its value.
// The empty string is passed to SelectAttrValue as the default, so a missing
// attribute yields "".
func attr(el *etree.Element, name string) string {
return el.SelectAttrValue(name, "")
}

// contentTypeTemplates maps an ontology type URI to the API path template for
// resources of that type. The "{{id}}" placeholder marks where the resource
// identifier is substituted by generateUrl.
var contentTypeTemplates = map[string]string{
	"http://www.ft.com/ontology/content/Content":             "/content/{{id}}",
	"http://www.ft.com/ontology/content/ContentPackage":      "/content/{{id}}",
	"http://www.ft.com/ontology/content/Article":             "/content/{{id}}",
	"http://www.ft.com/ontology/content/DynamicContent":      "/content/{{id}}",
	"http://www.ft.com/ontology/content/CustomCodeComponent": "/content/{{id}}",
	"http://www.ft.com/ontology/content/MediaResource":       "/content/{{id}}",
	"http://www.ft.com/ontology/content/Image":               "/content/{{id}}",
	"http://www.ft.com/ontology/content/ImageSet":            "/content/{{id}}",
	"http://www.ft.com/ontology/content/Graphic":             "/content/{{id}}",
	"http://www.ft.com/ontology/content/Video":               "/content/{{id}}",
	"http://www.ft.com/ontology/content/ClipSet":             "/content/{{id}}",
	"http://www.ft.com/ontology/content/Audio":               "/content/{{id}}",
	"http://www.ft.com/ontology/company/Organisation":        "/organisations/{{id}}",
	"http://www.ft.com/ontology/company/PublicCompany":       "/organisations/{{id}}",
}

// generateUrl builds the absolute http://api.ft.com URL for the resource
// identified by id whose ontology type URI is t.
//
// It returns the empty string when t has no entry in contentTypeTemplates.
// Only the first "{{id}}" placeholder of the matching template is replaced.
func generateUrl(t, id string) string {
	pathTemplate, known := contentTypeTemplates[t]
	if !known {
		return ""
	}
	return "http://api.ft.com" + strings.Replace(pathTemplate, "{{id}}", id, 1)
}
57 changes: 15 additions & 42 deletions libraries/from-bodyxml/go/html_transformers.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ type transformer func(el *etree.Element) contenttree.Node

var defaultTransformers = map[string]transformer{
"h1": func(h1 *etree.Element) contenttree.Node {
dfrgId := valueOr(attr(h1, "data-fragment-identifier"), attr(h1, "id"))
dfrgId := attr(h1, "data-fragment-identifier")
heading := &contenttree.Heading{
Type: contenttree.HeadingType,
Level: "chapter",
Expand All @@ -75,7 +75,7 @@ var defaultTransformers = map[string]transformer{
return heading
},
"h2": func(h2 *etree.Element) contenttree.Node {
dfrgId := valueOr(attr(h2, "data-fragment-identifier"), attr(h2, "id"))
dfrgId := attr(h2, "data-fragment-identifier")
return &contenttree.Heading{
Type: contenttree.HeadingType,
Level: "subheading",
Expand All @@ -84,7 +84,7 @@ var defaultTransformers = map[string]transformer{
}
},
"h3": func(h3 *etree.Element) contenttree.Node {
dfrgId := valueOr(attr(h3, "data-fragment-identifier"), attr(h3, "id"))
dfrgId := attr(h3, "data-fragment-identifier")
return &contenttree.Heading{
Type: contenttree.HeadingType,
Level: "subheading",
Expand All @@ -93,7 +93,7 @@ var defaultTransformers = map[string]transformer{
}
},
"h4": func(h4 *etree.Element) contenttree.Node {
dfrgId := valueOr(attr(h4, "data-fragment-identifier"), attr(h4, "id"))
dfrgId := attr(h4, "data-fragment-identifier")
return &contenttree.Heading{
Type: contenttree.HeadingType,
Level: "label",
Expand Down Expand Up @@ -232,33 +232,27 @@ var defaultTransformers = map[string]transformer{
Caption: attr(img, "longdesc"),
}
},

contentType.ImageSet: func(content *etree.Element) contenttree.Node {
dfrgId := valueOr(attr(content, "data-fragment-identifier"), attr(content, "id"))
dfrgId := attr(content, "data-fragment-identifier")
return &contenttree.ImageSet{
Type: contenttree.ImageSetType,
ID: attr(content, "url"),
ID: attr(content, "id"),
FragmentIdentifier: dfrgId,
}
},
contentType.Video: func(content *etree.Element) contenttree.Node {
return &contenttree.Video{
Type: contenttree.VideoType,
ID: attr(content, "url"),
ID: attr(content, "id"),
}
},
contentType.Content: func(content *etree.Element) contenttree.Node {
id := attr(content, "url")
parts := strings.Split(id, "/")
uuid := ""
if len(parts) > 0 {
uuid = parts[len(parts)-1]
}
dfrgId := valueOr(attr(content, "data-fragment-identifier"), attr(content, "id"))
id := attr(content, "id")
if attr(content, "data-asset-type") == "flourish" {
dfrgId := valueOr(attr(content, "data-fragment-identifier"), id)
return &contenttree.Flourish{
Type: contenttree.FlourishType,
Id: uuid,
Id: id,
FlourishType: attr(content, "data-flourish-type"),
LayoutWidth: string(toValidLayoutWidth(attr(content, "data-layout-width"))),
Description: attr(content, "alt"),
Expand All @@ -268,48 +262,30 @@ var defaultTransformers = map[string]transformer{
}
return &contenttree.Link{
Type: contenttree.LinkType,
URL: "https://www.ft.com/content/" + uuid,
URL: "https://www.ft.com/content/" + id,
Title: attr(content, "dataTitle"),
Children: []*contenttree.Phrasing{},
}
},
contentType.Article: func(content *etree.Element) contenttree.Node {
id := attr(content, "url")
parts := strings.Split(id, "/")
uuid := ""
if len(parts) > 0 {
uuid = parts[len(parts)-1]
}
return &contenttree.Link{
Type: contenttree.LinkType,
URL: "https://www.ft.com/content/" + uuid,
URL: "https://www.ft.com/content/" + attr(content, "id"),
Title: attr(content, "dataTitle"),
Children: []*contenttree.Phrasing{},
}
},
contentType.CustomCodeComponent: func(content *etree.Element) contenttree.Node {
id := attr(content, "url")
parts := strings.Split(id, "/")
uuid := ""
if len(parts) > 0 {
uuid = parts[len(parts)-1]
}
return &contenttree.CustomCodeComponent{
Type: contenttree.CustomCodeComponentType,
ID: uuid,
ID: attr(content, "id"),
LayoutWidth: string(toValidLayoutWidth(attr(content, "data-layout-width"))),
}
},
contentType.ClipSet: func(content *etree.Element) contenttree.Node {
id := attr(content, "url")
parts := strings.Split(id, "/")
uuid := ""
if len(parts) > 0 {
uuid = parts[len(parts)-1]
}
return &contenttree.ClipSet{
Type: contenttree.ClipSetType,
ID: uuid,
ID: attr(content, "id"),
LayoutWidth: string(toValidClipLayoutWidth(attr(content, "data-layout-width"))),
Autoplay: attr(content, "autoplay") == "true",
Loop: attr(content, "loop") == "true",
Expand All @@ -320,10 +296,7 @@ var defaultTransformers = map[string]transformer{
id := ""
teaser := ""
if link := findChild(rl, "content"); link != nil {
id = generateUrl(attr(link, "type"), attr(link, "id"))
teaser = textContent(link)
} else if link := findChild(rl, "ft-content"); link != nil {
id = attr(link, "url")
id = attr(link, "id")
teaser = textContent(link)
}
heading := findChild(rl, "recommended-title")
Expand Down
31 changes: 9 additions & 22 deletions libraries/from-bodyxml/go/transform.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,19 +55,8 @@ func convertToContentTree(elem etree.Token, m contenttree.Node) error {
return nil
}

if t.Tag == "content" || t.Tag == "related" || t.Tag == "concept" {
id := attr(t, "id")
typeAttr := attr(t, "type")
if id != "" {
t.CreateAttr("url", generateUrl(typeAttr, id))
if attr(t, "data-asset-type") != "flourish" {
t.RemoveAttr("id")
}
}
}

tag := t.Tag
if t.Tag == "content" || t.Tag == "ft-content" {
if t.Tag == "content" {
for _, attr := range t.Attr {
if attr.Key == "type" {
tag = attr.Value
Expand All @@ -78,13 +67,14 @@ func convertToContentTree(elem etree.Token, m contenttree.Node) error {

transformer, ok := defaultTransformers[tag]
if !ok {
return fmt.Errorf("unknownNode transformer for tag <%s>", t.Tag)
//skip unknown tags
return nil
}

switch transformed := transformer(t).(type) {
case *unknownNode:
{
return fmt.Errorf("unknownNode div node with class '%s'", transformed.Class)
//skip unknown div
return nil
}
case *liftChildrenNode:
{
Expand All @@ -100,12 +90,8 @@ func convertToContentTree(elem etree.Token, m contenttree.Node) error {
{
err := m.AppendChild(transformed)
if err != nil {
return fmt.Errorf(
"failed to append transformed child of type <%s> for parent <%s>: %w",
transformed.GetType(),
m.GetType(),
err,
)
//skip invalid child nodes
return nil
}
if transformed.GetChildren() != nil {
for _, child := range t.Child {
Expand All @@ -127,7 +113,8 @@ func convertToContentTree(elem etree.Token, m contenttree.Node) error {
}
err := m.AppendChild(tx)
if err != nil {
return err
//skip invalid nodes
return nil
}
}
return nil
Expand Down
1 change: 0 additions & 1 deletion libraries/from-bodyxml/go/transform_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ func TestTransform(t *testing.T) {
for _, test := range getTestCases(t) {
t.Run(test.name, func(t *testing.T) {
bodyTree, err := Transform(test.input)

if err != nil && !test.wantErr {
t.Errorf("Failed with unexpected error: %v", err)
}
Expand Down
Loading
Loading