11/*
2- * Copyright 2020 Fabian Steeg, hbz
2+ * Copyright 2020, 2021 Fabian Steeg, hbz
33 *
44 * Licensed under the Apache License, Version 2.0 the "License";
55 * you may not use this file except in compliance with the License.
1717
1818import java .io .IOException ;
1919import java .io .Reader ;
20+ import java .io .UnsupportedEncodingException ;
21+ import java .net .URLDecoder ;
22+ import java .nio .charset .StandardCharsets ;
23+ import java .util .HashMap ;
24+ import java .util .Map ;
2025import java .util .UUID ;
2126
2227import org .apache .commons .io .IOUtils ;
3843 * @author Fabian Steeg (fsteeg)
3944 *
4045 */
41- @ Description ("Decode HTML to metadata events" )
46+ @ Description ("Decode HTML to metadata events. The attrValsAsSubfields option can be used to override "
47+ + "the default attribute values to be used as subfields (e.g. by default "
48+ + "`link rel=\" canonical\" href=\" http://example.org\" ` becomes `link.canonical`). "
49+ + "It expects an HTTP-style query string specifying as key the attributes whose value should "
50+ + "be used as a subfield, and as value the attribute whose value should be the subfield value, "
51+ + "e.g. the default contains `link.rel=href`. To use the HTML element text as the value "
52+ + "(instead of another attribute), omit the value of the query-string key-value pair, "
53+ + "e.g. `title.lang`. To add to the defaults, instead of replacing them, start with an `&`, "
54+ + "e.g. `&h3.class`" )
4255@ In (Reader .class )
4356@ Out (StreamReceiver .class )
4457@ FluxCommand ("decode-html" )
4558public class HtmlDecoder extends DefaultObjectPipe <Reader , StreamReceiver > {
4659
/**
 * Built-in configuration, written as an HTTP-style query string. Each key is
 * {@code element.attribute}; each value names the attribute whose value is
 * emitted under the literal name taken from the key attribute's value.
 */
private static final String DEFAULT_ATTR_VALS_AS_SUBFIELDS =
        "meta.name=content&meta.property=content&link.rel=href&a.rel=href";

/**
 * Active configuration: maps {@code element.attribute} to the name of the
 * attribute supplying the subfield value. An empty string means the element
 * text is used instead.
 */
private Map<String, String> attrValsAsSubfields;

/**
 * Creates a decoder that starts out with the built-in default configuration.
 */
public HtmlDecoder() {
    // NOTE(review): this invokes a public, overridable setter from the
    // constructor; an overriding subclass would see the call before its own
    // fields are initialised.
    this.setAttrValsAsSubfields(DEFAULT_ATTR_VALS_AS_SUBFIELDS);
}
67+
4768 @ Override
4869 public void process (final Reader reader ) {
4970 try {
@@ -61,18 +82,54 @@ private void process(Element parent, StreamReceiver receiver) {
6182 for (Element element : parent .children ()) {
6283 receiver .startEntity (element .nodeName ());
6384 Attributes attributes = element .attributes ();
85+ boolean addedValueAsSubfield = false ;
6486 for (Attribute attribute : attributes ) {
87+ addedValueAsSubfield = handleAttributeValuesAsSubfields (receiver , element , attributes , attribute );
6588 receiver .literal (attribute .getKey (), attribute .getValue ());
6689 }
6790 if (element .children ().isEmpty ()) {
6891 String text = element .text ().trim ();
6992 String value = text .isEmpty () ? element .data () : text ;
70- if (!value .isEmpty ()) {
93+ if (!value .isEmpty () && ! addedValueAsSubfield ) {
7194 receiver .literal ("value" , value );
7295 }
7396 }
7497 process (element , receiver );
7598 receiver .endEntity ();
7699 }
77100 }
101+
102+ private boolean handleAttributeValuesAsSubfields (StreamReceiver receiver , Element element ,
103+ Attributes attributes , Attribute attribute ) {
104+ String fullFieldKey = element .nodeName () + "." + attribute .getKey ();
105+ if (attrValsAsSubfields .containsKey (fullFieldKey )) {
106+ String configValue = attrValsAsSubfields .get (fullFieldKey );
107+ if (configValue .trim ().isEmpty ()) {
108+ receiver .literal (attribute .getValue (), element .text ().trim ());
109+ return true ;
110+ } else {
111+ String value = attributes .get (configValue );
112+ receiver .literal (attribute .getValue (), value );
113+ }
114+ }
115+ return false ;
116+ }
117+
118+ public void setAttrValsAsSubfields (String mapString ) {
119+ this .attrValsAsSubfields = new HashMap <String , String >();
120+ String input = mapString .startsWith ("&" ) ? DEFAULT_ATTR_VALS_AS_SUBFIELDS + mapString
121+ : mapString ;
122+ for (String nameValuePair : input .split ("&" )) {
123+ String [] nameValue = nameValuePair .split ("=" );
124+ try {
125+ String utf8 = StandardCharsets .UTF_8 .name ();
126+ String key = URLDecoder .decode (nameValue [0 ], utf8 );
127+ String val = nameValue .length > 1 ? URLDecoder .decode (nameValue [1 ], utf8 ) : "" ;
128+ attrValsAsSubfields .put (key , val );
129+ } catch (UnsupportedEncodingException e ) {
130+ e .printStackTrace ();
131+ }
132+ }
133+ }
134+
78135}
0 commit comments