11/*
2- * Copyright 2020 Fabian Steeg, hbz
2+ * Copyright 2020, 2021 Fabian Steeg, hbz
33 *
44 * Licensed under the Apache License, Version 2.0 the "License";
55 * you may not use this file except in compliance with the License.
1717
1818import java .io .IOException ;
1919import java .io .Reader ;
20+ import java .io .UnsupportedEncodingException ;
21+ import java .net .URLDecoder ;
22+ import java .nio .charset .StandardCharsets ;
23+ import java .util .HashMap ;
24+ import java .util .Map ;
2025import java .util .UUID ;
2126
2227import org .apache .commons .io .IOUtils ;
3843 * @author Fabian Steeg (fsteeg)
3944 *
4045 */
41- @ Description ("Decode HTML to metadata events" )
46+ @ Description ("Decode HTML to metadata events. The attrValsAsSubfields option can be used to override "
47+ + "the default attribute values to be used as subfields (e.g. by default "
48+ + "`link rel=\" canonical\" href=\" http://example.org\" ` becomes `link.canonical`). "
49+ + "It expects an HTTP-style query string specifying as key the attributes whose value should "
50+ + "be used as a subfield, and as value the attribute whose value should be the subfield value, "
51+ + "e.g. the default contains `link.rel=href`. To use the HTML element text as the value "
52+ + "(instead of another attribute), omit the value of the query-string key-value pair, "
53+ + "e.g. `title.lang`. To add to the defaults, instead of replacing them, start with an `&`, "
54+ + "e.g. `&h3.class`" )
4255@ In (Reader .class )
4356@ Out (StreamReceiver .class )
4457@ FluxCommand ("decode-html" )
4558public class HtmlDecoder extends DefaultObjectPipe <Reader , StreamReceiver > {
4659
/**
 * Built-in configuration in query-string form: each key is an
 * {@code element.attribute} pair, each value names the attribute whose
 * value is emitted for it.
 */
private static final String DEFAULT_ATTR_VALS_AS_SUBFIELDS = //
        "meta.name=content&meta.property=content&link.rel=href&a.rel=href";

/** Parsed configuration; replaced on every call to {@link #setAttrValsAsSubfields(String)}. */
private Map<String, String> attrValsAsSubfields;

/**
 * Creates a decoder that starts out with the default
 * attribute-values-as-subfields configuration.
 */
public HtmlDecoder() {
    this.setAttrValsAsSubfields(DEFAULT_ATTR_VALS_AS_SUBFIELDS);
}
67+
4768 @ Override
4869 public void process (final Reader reader ) {
4970 try {
@@ -62,6 +83,7 @@ private void process(Element parent, StreamReceiver receiver) {
6283 receiver .startEntity (element .nodeName ());
6384 Attributes attributes = element .attributes ();
6485 for (Attribute attribute : attributes ) {
86+ handleAttributeValuesAsSubfields (receiver , element , attributes , attribute );
6587 receiver .literal (attribute .getKey (), attribute .getValue ());
6688 }
6789 if (element .children ().isEmpty ()) {
@@ -75,4 +97,35 @@ private void process(Element parent, StreamReceiver receiver) {
7597 receiver .endEntity ();
7698 }
7799 }
100+
101+ private void handleAttributeValuesAsSubfields (StreamReceiver receiver , Element element ,
102+ Attributes attributes , Attribute attribute ) {
103+ String fullFieldKey = element .nodeName () + "." + attribute .getKey ();
104+ if (attrValsAsSubfields .containsKey (fullFieldKey )) {
105+ String configValue = attrValsAsSubfields .get (fullFieldKey );
106+ if (configValue .trim ().isEmpty ()) {
107+ receiver .literal (attribute .getValue (), element .text ().trim ());
108+ } else {
109+ String value = attributes .get (configValue );
110+ receiver .literal (attribute .getValue (), value );
111+ }
112+ }
113+ }
114+
115+ public void setAttrValsAsSubfields (String mapString ) {
116+ this .attrValsAsSubfields = new HashMap <String , String >();
117+ String input = mapString .startsWith ("&" ) ? DEFAULT_ATTR_VALS_AS_SUBFIELDS + mapString
118+ : mapString ;
119+ for (String nameValuePair : input .split ("&" )) {
120+ String [] nameValue = nameValuePair .split ("=" );
121+ try {
122+ String utf8 = StandardCharsets .UTF_8 .name ();
123+ String key = URLDecoder .decode (nameValue [0 ], utf8 );
124+ String val = nameValue .length > 1 ? URLDecoder .decode (nameValue [1 ], utf8 ) : "" ;
125+ attrValsAsSubfields .put (key , val );
126+ } catch (UnsupportedEncodingException e ) {
127+ e .printStackTrace ();
128+ }
129+ }
130+ }
78131}
0 commit comments