From 219278da538b7973f469e87c0a15165a73a8f522 Mon Sep 17 00:00:00 2001 From: "malte.vogl" Date: Fri, 24 Sep 2021 07:42:45 +0000 Subject: [PATCH 01/53] Initial commit --- README.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..df7a75b --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# SemanticLayerTools + +Collects tools to create semantic layers in the socio-epistemic networks framework. Source material can be any structured corpus with metadata of authors, time, and at least one text column. \ No newline at end of file From 53173d65d4682142b5a034cba778bebf9f251811 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Fri, 24 Sep 2021 14:55:46 +0200 Subject: [PATCH 02/53] init semantic layer tools package --- .gitignore | 4 + LICENSE | 21 ++++ docs/Makefile | 20 ++++ docs/conf.py | 55 +++++++++++ docs/index.rst | 22 +++++ docs/make.bat | 35 +++++++ pyproject.toml | 6 ++ setup.cfg | 29 ++++++ src/semanticlayertools/__init__.py | 0 src/semanticlayertools/cleaning/__init__.py | 0 src/semanticlayertools/linkage/__init__.py | 0 src/semanticlayertools/linkage/wordscore.py | 104 ++++++++++++++++++++ 12 files changed, 296 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 docs/Makefile create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/make.bat create mode 100644 pyproject.toml create mode 100644 setup.cfg create mode 100644 src/semanticlayertools/__init__.py create mode 100644 src/semanticlayertools/cleaning/__init__.py create mode 100644 src/semanticlayertools/linkage/__init__.py create mode 100644 src/semanticlayertools/linkage/wordscore.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ba732dc --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +**build +**dist +*env +**.egg-info diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..b2f6552 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Malte Vogl (ModelSEN project) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . 
+BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..372d312 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,55 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'SemanticLayerTools' +copyright = '2021, Malte Vogl' +author = 'Malte Vogl' + +# The full version, including alpha/beta/rc tags +release = '0.0.1' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..e57bcff --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,22 @@ +.. SemanticLayerTools documentation master file, created by + sphinx-quickstart on Fri Sep 24 14:43:14 2021. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to SemanticLayerTools's documentation! +============================================== + +This project collects tools to build semantic layers from text corpora. + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..8084272 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..374b58c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..2113f09 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,29 @@ +[metadata] +name = semanticlayertools +version = 0.0.1 +author = Malte Vogl +author_email = mvogl@mpiwg-berlin.mpg.de +description = Create semantic layers using different methods for word linking. +long_description = file: README.md +long_description_content_type = text/markdown +url = https://gitlab.gwdg.de/modelsen/semanticlayertools +project_urls = + Project Home = https://modelsen.mpiwg-berlin.mpg.de + Bug Tracker = https://gitlab.gwdg.de/modelsen/semanticlayertools/-/issues +classifiers = + Programming Language :: Python :: 3 + License :: OSI Approved :: MIT License + Operating System :: OS Independent + +[options] +package_dir = + = src +packages = find: +python_requires = >=3.6 +install_requires = + tqdm + nltk + numpy + +[options.packages.find] +where = src diff --git a/src/semanticlayertools/__init__.py b/src/semanticlayertools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semanticlayertools/cleaning/__init__.py b/src/semanticlayertools/cleaning/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semanticlayertools/linkage/__init__.py b/src/semanticlayertools/linkage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semanticlayertools/linkage/wordscore.py b/src/semanticlayertools/linkage/wordscore.py new file mode 100644 index 0000000..7ce5d67 --- /dev/null +++ b/src/semanticlayertools/linkage/wordscore.py @@ -0,0 +1,104 @@ +import os +import re +from collections import Counter, defaultdict + +from tqdm import tqdm +import numpy as np +import nltk + +try: + nltk.pos_tag(nltk.word_tokenize('This is a test sentence.')) +except LookupError: + print('Installing nltk perceptron tagger.') + nltk.download('averaged_perceptron_tagger') + + +class CalculateScores(object): + """Calculates ngram scores for documents. 
+ + Considered parts of speech are (see NLTK docs for details) + - Nouns: 'NN', 'NNS', 'NNP', 'NNPS' + - Adjectives: 'JJ', 'JJR', 'JJS' + """ + + def __init__(self, sourceDataframe, textCol="text", pubIDCol="pubID", ngramsize=5,): + + self.baseDF = sourceDataframe + self.textCol = textCol + self.pubIDCol = pubIDCol + self.ngramEnd = ngramsize + self.outputDict = {} + self.allNGrams = [] + self.counts = {} + self.allgramslist = [] + self.uniqueNGrams = () + + def getTermPatterns(self): + """Create dictionaries of occuring ngrams.""" + allNGrams = {x: [] for x in range(1, self.ngramEnd + 1, 1)} + pos_tag = ["NN", "NNS", "NNP", "NNPS", "JJ", "JJR", "JJS"] + for _, row in tqdm(self.baseDF.iterrows()): + tokens = nltk.word_tokenize(row[self.textCol]) + pos = nltk.pos_tag(tokens) + nnJJtokens = [x[0].lower() for x in pos if x[1] in pos_tag] + tempNGram = [] + for i in range(1, self.ngramEnd + 1, 1): + val = allNGrams[i] + newngrams = list(nltk.ngrams(nnJJtokens, i)) + val.extend(newngrams) + tempNGram.extend(newngrams) + allNGrams.update({i: val}) + self.outputDict[row[self.pubIDCol]] = tempNGram + self.allNGrams = allNGrams + allgrams = [x for y in [y for x, y in self.allNGrams.items()] for x in y] + self.allgramslist = allgrams + self.counts = Counter(allgrams) + self.uniqueNGrams = set(allgrams) + + def getScore(self, target): + """Calculate ngram score.""" + meta = { + "target": target, + "counts": self.counts[target], + "corpusL": len(self.allgramslist), + "maxL": len(target), + } + + res = defaultdict(list()) + + for idx, subgram in enumerate(target): + key = idx + 1 + for tup in self.allNGrams[2]: + if tup[1:][0] == subgram: + res[f"l_{key}"].append(tup[:1][0]) + elif tup[:-1][0] == subgram: + res[f"r_{key}"].append(tup[1:][0]) + valueList = [] + for L in range(1, meta["maxL"] + 1, 1): + leftkey = f"l_{L}" + rightkey = f"r_{L}" + if rightkey not in res.keys(): + rvalue = 0 + else: + rvalue = len(list(set(res[rightkey]))) + if leftkey not in res.keys(): + lvalue = 0 + else: + lvalue = len(list(set(res[leftkey]))) + valueList.append((lvalue + 1) * (rvalue + 1)) + return { + target: meta["counts"] * (np.prod(valueList)) ** (1 / (2.0 * meta["maxL"])) + } + + def run(self): + """Get score for all documents.""" + scores = {} + self.getTermPatterns() + for target in tqdm(self.uniqueNGrams): + scores.update(self.getScore(target)) + for key, val in self.outputDict.items(): + tmpList = [] + for elem in val: + tmpList.append([elem, scores[elem]]) + self.outputDict.update({key: tmpList}) + return scores, self.outputDict From 258d8399c94f199ee8a0e58c21df5fce0eb217a5 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 27 Sep 2021 16:02:58 +0200 Subject: [PATCH 03/53] wip continue word scoring linkage --- src/semanticlayertools/cleaning/clean.py | 0 src/semanticlayertools/linkage/wordscore.py | 184 +++++++++++++++++++- 2 files changed, 178 insertions(+), 6 deletions(-) create mode 100644 src/semanticlayertools/cleaning/clean.py diff --git a/src/semanticlayertools/cleaning/clean.py b/src/semanticlayertools/cleaning/clean.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semanticlayertools/linkage/wordscore.py b/src/semanticlayertools/linkage/wordscore.py index 7ce5d67..b7239e4 100644 --- a/src/semanticlayertools/linkage/wordscore.py +++ b/src/semanticlayertools/linkage/wordscore.py @@ -1,9 +1,12 @@ import os import re from collections import Counter, defaultdict +from itertools import islice, combinations + from tqdm import tqdm import numpy as np +import pandas as pd import nltk try: @@ 
-13,7 +16,7 @@ nltk.download('averaged_perceptron_tagger') -class CalculateScores(object): +class CalculateScores(): """Calculates ngram scores for documents. Considered parts of speech are (see NLTK docs for details) @@ -21,11 +24,12 @@ class CalculateScores(object): - Adjectives: 'JJ', 'JJR', 'JJS' """ - def __init__(self, sourceDataframe, textCol="text", pubIDCol="pubID", ngramsize=5,): + def __init__(self, sourceDataframe, textColumn="text", pubIDColumn="pubID", yearColumn='year', ngramsize=5,): self.baseDF = sourceDataframe - self.textCol = textCol - self.pubIDCol = pubIDCol + self.textCol = textColumn + self.pubIDCol = pubIDColumn + self.yearCol = yearColumn self.ngramEnd = ngramsize self.outputDict = {} self.allNGrams = [] @@ -87,10 +91,10 @@ def getScore(self, target): lvalue = len(list(set(res[leftkey]))) valueList.append((lvalue + 1) * (rvalue + 1)) return { - target: meta["counts"] * (np.prod(valueList)) ** (1 / (2.0 * meta["maxL"])) + target: meta["counts"]/meta["corpusL"] * (np.prod(valueList)) ** (1 / (2.0 * meta["maxL"])) } - def run(self): + def run(self, write=False, outpath='./'): """Get score for all documents.""" scores = {} self.getTermPatterns() @@ -101,4 +105,172 @@ def run(self): for elem in val: tmpList.append([elem, scores[elem]]) self.outputDict.update({key: tmpList}) + if write is True: + for year, df in self.baseDF.groupby(self.yearCol): + with open(f'{outpath}{str(year)}.csv', 'a') as yearfile: + for pub in df[self.pubIDCol].unique(): + for elem in self.outputDict[pub]: + yearfile.write(f'{pub},{elem[0]},{elem[1]}') return scores, self.outputDict + + +class LinksOverTime(): + """To keep track of nodes over time, we need a global register of node names. + + Input: + """ + + def __init__(self, outputPath, scorePath, dataframe, scoreLimit=1.0, debug=False, windowSize=1): + self.dataframe = dataframe + self.authorCol = 'author' + self.pubIDCol = 'pubIDelm' + self.scoreLimit = scoreLimit + self.outpath = outputPath + self.scorepath = scorePath + self.nodeMap = {} + self.debug = debug + self.windowSize = windowSize + + def _window(self, seq): + """Return a sliding window (of width n) over data from the iterable. + + s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... 
+ """ + n = self.windowSize + it = iter(seq) + result = tuple(islice(it, n)) + if len(result) == n: + yield result + for elem in it: + result = result[1:] + (elem,) + yield result + + def _createSlices(self): + slices = [] + years = sorted(self.dataframe.year.unique()) + for x in self._window(years): + slices.append(x) + return slices + + def createNodeRegister(self, sl): + """Create multilayer node register for time slice.""" + if self.debug is True: + print(f'Slice: {sl[0]}') + dataframe = self.dataframe[self.dataframe.year.isin(sl)] + dfNgramsList = [pd.read_csv( + self.scorepath + str(slN) + '.tsv', + sep='\t', + header=None + ) for slN in sl] + ngramdataframe = pd.concat(dfNgramsList) + ngramdataframe = ngramdataframe[ngramdataframe[2] > self.scoreLimit] + + authorList = [x for y in dataframe[self.authorCol].values for x in y] + + authors = [x for x in set(authorList) if x] + pubs = dataframe[self.pubIDCol].fillna('None').unique() + ngrams = ngramdataframe[1].unique() + + for authorval in authors: + if not self.nodeMap.values(): + self.nodeMap.update({authorval: 1}) + else: + if authorval not in self.nodeMap.keys(): + self.nodeMap.update( + {authorval: max(self.nodeMap.values()) + 1} + ) + for pubval in list(pubs): + if pubval not in self.nodeMap.keys(): + self.nodeMap.update({pubval: max(self.nodeMap.values()) + 1}) + for ngramval in list(ngrams): + if ngramval not in self.nodeMap.keys(): + self.nodeMap.update({ngramval: max(self.nodeMap.values()) + 1}) + + if self.debug is True: + print( + '\tNumber of vertices (authors, papers and ngrams) {0}'.format( + max(self.nodeMap.values()) + ) + ) + + def writeLinks(self, sl, recreate=False): + """Write links to file.""" + dataframe = self.dataframe[self.dataframe.year.isin(sl)] + filePath = self.outpath + 'multilayerPajek_{0}.net'.format(sl[0]) + + if os.path.isfile(filePath): + if recreate is False: + raise IOError( + f'File at {filePath} exists. Set recreate = True to rewrite file.' 
+ ) + if recreate is True: + os.remove(filePath) + + dfNgramsList = [pd.read_csv( + self.scorepath + str(slN) + '.tsv', + sep='\t', + header=None + ) for slN in sl] + ngramdataframe = pd.concat(dfNgramsList) + ngramdataframe = ngramdataframe[ngramdataframe[2] > self.scoreLimit] + + with open(filePath, 'a') as file: + file.write("# A network in a general multiplex format\n") + file.write("*Vertices {0}\n".format(max(self.nodeMap.values()))) + for x, y in self.nodeMap.items(): + tmpStr = '{0} "{1}"\n'.format(y, x) + if tmpStr: + file.write(tmpStr) + file.write("*Multiplex\n") + file.write("# layer node layer node [weight]\n") + if self.debug is True: + print('\tWriting inter-layer links to file.') + for _, row in dataframe.fillna('').iterrows(): + authors = row[self.authorCol] + paper = row[self.pubIDCol] + if paper not in self.nodeMap.keys(): + print(f'Cannot find {paper}') + ngramsList = ngramdataframe[ngramdataframe[0] == paper] + paperNr = self.nodeMap[paper] + if len(authors) >= 2: + # pairs = [x for x in combinations(authors, 2)] + for pair in combinations(authors, 2): # pairs: + file.write('{0} {1} {2} {3} 1\n'.format( + 1, + self.nodeMap[pair[0]], + 1, + self.nodeMap[pair[1]] + ) + ) + for author in authors: + try: + authNr = self.nodeMap[author] + file.write('{0} {1} {2} {3} 1\n'.format( + 1, + authNr, + 2, + paperNr + ) + ) + except KeyError: + pass + for _, ngramrow in ngramsList.iterrows(): + try: + ngramNr = self.nodeMap[ngramrow[1]] + weight = ngramrow[2] + file.write('{0} {1} {2} {3} {4}\n'.format( + 2, + paperNr, + 3, + ngramNr, + weight + ) + ) + except KeyError: + pass + + def run(self, recreate=False): + """Create all data for slices.""" + for sl in tqdm(self._createSlices()): + self.createNodeRegister(sl) + self.writeLinks(sl, recreate=recreate) From 1f05a8e3c335f239ea06941f9626072c44228a77 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 28 Sep 2021 15:03:41 +0200 Subject: [PATCH 04/53] wip add tests and docs with tox --- .gitignore | 3 ++ docs/cleaning.rst | 6 ++++ docs/conf.py | 6 +++- docs/index.rst | 3 ++ docs/linkage.rst | 6 ++++ requirements.txt | 5 +++ requirements_dev.txt | 1 + setup.cfg | 4 ++- src/semanticlayertools/cleaning/text.py | 36 +++++++++++++++++++ src/semanticlayertools/linkage/wordscore.py | 18 +++++----- .../cleaning/clean.py => tests/__init__.py | 0 tests/cleaning/__init__.py | 0 tests/cleaning/test_textcleaning.py | 15 ++++++++ tests/linkage/__init__.py | 0 tests/linkage/test_wordscore.py | 19 ++++++++++ tests/testdata/testdata.json | 1 + tox.ini | 23 ++++++++++++ 17 files changed, 135 insertions(+), 11 deletions(-) create mode 100644 docs/cleaning.rst create mode 100644 docs/linkage.rst create mode 100644 requirements.txt create mode 100644 requirements_dev.txt create mode 100644 src/semanticlayertools/cleaning/text.py rename src/semanticlayertools/cleaning/clean.py => tests/__init__.py (100%) create mode 100644 tests/cleaning/__init__.py create mode 100644 tests/cleaning/test_textcleaning.py create mode 100644 tests/linkage/__init__.py create mode 100644 tests/linkage/test_wordscore.py create mode 100644 tests/testdata/testdata.json create mode 100644 tox.ini diff --git a/.gitignore b/.gitignore index ba732dc..48a75fd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ **build **dist *env +*testlab **.egg-info +*.tox +**__pycache__ diff --git a/docs/cleaning.rst b/docs/cleaning.rst new file mode 100644 index 0000000..c54eeec --- /dev/null +++ b/docs/cleaning.rst @@ -0,0 +1,6 @@ +Text and data cleaning +====================== + +.. 
automodule:: semanticlayertools.cleaning.text + :members: + :undoc-members: diff --git a/docs/conf.py b/docs/conf.py index 372d312..7ce9006 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,6 +24,8 @@ # The full version, including alpha/beta/rc tags release = '0.0.1' +master_doc = 'index' + # -- General configuration --------------------------------------------------- @@ -31,6 +33,8 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx' ] # Add any paths that contain templates here, relative to this directory. @@ -52,4 +56,4 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] \ No newline at end of file +html_static_path = ['_static'] diff --git a/docs/index.rst b/docs/index.rst index e57bcff..747681b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,6 +12,9 @@ This project collects tools to build semantic layers from text corpora. :maxdepth: 2 :caption: Contents: + linkage + cleaning + Indices and tables diff --git a/docs/linkage.rst b/docs/linkage.rst new file mode 100644 index 0000000..441a5d1 --- /dev/null +++ b/docs/linkage.rst @@ -0,0 +1,6 @@ +Word scoring and linkage +======================== + +.. automodule:: semanticlayertools.linkage.wordscore + :members: + :undoc-members: diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6d96cb6 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +tqdm +nltk +numpy +spacy +pandas diff --git a/requirements_dev.txt b/requirements_dev.txt new file mode 100644 index 0000000..053148f --- /dev/null +++ b/requirements_dev.txt @@ -0,0 +1 @@ +tox diff --git a/setup.cfg b/setup.cfg index 2113f09..4f861e3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,11 +19,13 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.6 +python_requires = >=3.7 install_requires = tqdm nltk numpy + spacy + pandas [options.packages.find] where = src diff --git a/src/semanticlayertools/cleaning/text.py b/src/semanticlayertools/cleaning/text.py new file mode 100644 index 0000000..8d7f2cb --- /dev/null +++ b/src/semanticlayertools/cleaning/text.py @@ -0,0 +1,36 @@ +import re +import spacy + +try: + nlp = spacy.load("en_core_web_lg") +except OSError: + pass + + + +def lemmaSpacy(text): + """Clean text in dataframe column.""" + try: + if isinstance(text, list): + text = text[0] + doc = nlp(text) + tokens = ' '.join( + [t.lemma_ for t in doc if not t.is_stop and len(t) > 3] + ) + return tokens.lower() + except: + return '' + + +def htmlTags(text): + """Remove html tags in text.""" + if isinstance(text, list): + text = text[0] + for tagPair in [ + ('', '_'), + ('', ''), + ('', '^'), + ('', '') + ]: + text = re.sub(tagPair[0], tagPair[1], text) + return text diff --git a/src/semanticlayertools/linkage/wordscore.py b/src/semanticlayertools/linkage/wordscore.py index b7239e4..e19a69e 100644 --- a/src/semanticlayertools/linkage/wordscore.py +++ b/src/semanticlayertools/linkage/wordscore.py @@ -3,7 +3,6 @@ from collections import Counter, defaultdict from itertools import islice, combinations - from tqdm import tqdm import numpy as np import pandas as pd @@ -68,7 +67,7 @@ def getScore(self, target): "maxL": len(target), } - res = defaultdict(list()) + res = defaultdict(list) for idx, subgram in enumerate(target): key = idx + 
1 @@ -91,7 +90,7 @@ def getScore(self, target): lvalue = len(list(set(res[leftkey]))) valueList.append((lvalue + 1) * (rvalue + 1)) return { - target: meta["counts"]/meta["corpusL"] * (np.prod(valueList)) ** (1 / (2.0 * meta["maxL"])) + target: 1/meta["counts"] * (np.prod(valueList)) ** (1 / (2.0 * meta["maxL"])) } def run(self, write=False, outpath='./'): @@ -120,10 +119,11 @@ class LinksOverTime(): Input: """ - def __init__(self, outputPath, scorePath, dataframe, scoreLimit=1.0, debug=False, windowSize=1): + def __init__(self, outputPath, scorePath, dataframe, authorColumn='authors', pubIDColumn="pubID", yearColumn='year', scoreLimit=1.0, debug=False, windowSize=1): self.dataframe = dataframe - self.authorCol = 'author' - self.pubIDCol = 'pubIDelm' + self.authorCol = authorColumn + self.pubIDCol = pubIDColumn + self.yearColumn = yearColumn self.scoreLimit = scoreLimit self.outpath = outputPath self.scorepath = scorePath @@ -147,7 +147,7 @@ def _window(self, seq): def _createSlices(self): slices = [] - years = sorted(self.dataframe.year.unique()) + years = sorted(self.dataframe[self.yearColumn].unique()) for x in self._window(years): slices.append(x) return slices @@ -156,7 +156,7 @@ def createNodeRegister(self, sl): """Create multilayer node register for time slice.""" if self.debug is True: print(f'Slice: {sl[0]}') - dataframe = self.dataframe[self.dataframe.year.isin(sl)] + dataframe = self.dataframe[self.dataframe[self.yearColumn].isin(sl)] dfNgramsList = [pd.read_csv( self.scorepath + str(slN) + '.tsv', sep='\t', @@ -195,7 +195,7 @@ def createNodeRegister(self, sl): def writeLinks(self, sl, recreate=False): """Write links to file.""" - dataframe = self.dataframe[self.dataframe.year.isin(sl)] + dataframe = self.dataframe[self.dataframe[self.yearColumn].isin(sl)] filePath = self.outpath + 'multilayerPajek_{0}.net'.format(sl[0]) if os.path.isfile(filePath): diff --git a/src/semanticlayertools/cleaning/clean.py b/tests/__init__.py similarity index 100% rename from src/semanticlayertools/cleaning/clean.py rename to tests/__init__.py diff --git a/tests/cleaning/__init__.py b/tests/cleaning/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/cleaning/test_textcleaning.py b/tests/cleaning/test_textcleaning.py new file mode 100644 index 0000000..02b228a --- /dev/null +++ b/tests/cleaning/test_textcleaning.py @@ -0,0 +1,15 @@ +from semanticlayertools.cleaning.text import htmlTags, lemmaSpacy + + +def test_htmlclean(): + """Test removal of html tags.""" + testtext = "This He3 is really cool, super2 cool!" + resultString = "This He_3 is really cool, super^2 cool!" + assert htmlTags(testtext) == resultString + + +def test_lemmaSpacy(): + """Test lemmatizing with Spacy.""" + testtext = "In this paper we analyze the difficulties of gravity in rotating black holes." 
+ resultString = "paper analyze difficulty gravity rotate black hole" + assert lemmaSpacy(testtext) == resultString diff --git a/tests/linkage/__init__.py b/tests/linkage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/linkage/test_wordscore.py b/tests/linkage/test_wordscore.py new file mode 100644 index 0000000..f40a60e --- /dev/null +++ b/tests/linkage/test_wordscore.py @@ -0,0 +1,19 @@ +import unittest +import os +import pandas as pd +from semanticlayertools.linkage.wordscore import CalculateScores + +basePath = os.path.dirname(os.path.abspath(__file__ + "/../")) +filePath = f'{basePath}/testdata/testdata.json' + +df = pd.read_json(filePath) + +class TestCalculateScores(unittest.TestCase): + + def setUp(self): + self.scoreinit = CalculateScores(df, textColumn='clean', pubIDColumn='pubIDs') + self.scorePattern = self.scoreinit.getTermPatterns() + self.scoreOut = self.scoreinit.run() + + def test_scoring(self): + self.assertLessEqual(self.scoreOut[0][('theory',)], 1) diff --git a/tests/testdata/testdata.json b/tests/testdata/testdata.json new file mode 100644 index 0000000..3f86d7d --- /dev/null +++ b/tests/testdata/testdata.json @@ -0,0 +1 @@ +{"clean":{"0":"The adsorption of halogens on metal films\u2014I Adsorption measurements and surface potentials for chlorine on nickel","1":"Some Applications of Power Law Analysis to Radioisotope Studies in Man","2":"This investigation consists of two different but related parts. 1. The thermionic properties of rhenium and rhenium with an adsorbed layer of thorium atoms, were investigated. The Richardson constants for rhenium were found to be \u03c6=4.85 eV, A=66 A per cm^2-deg^2. The work function decreases with increasing thorium coverage \u03c3 to a minimum of 3.15 eV at \u03c3=4.2\u00d710^14 atoms per sq cm then rises to a constant value of 3.3 eV at \u03c3=8\u00d710^14 atoms per sq cm. A comparison with the system tungsten\u2014thorium shows that: (a) \u03c6_min occurs at the same surface density in both cases, (b) \u03c6_const is attained at the same surface density in both cases, and this density is that of the atoms in a (100) plane of thorium metal, and (c) the values of \u03c6_const are equal and different from that of bulk thorium by only 0.1 eV. On the basis of these regularities a model is proposed to explain the variation of \u03c6 with \u03c3. 2. The thermal desorption of the adsorbed atoms was studied at temperatures between 2203\u00b0 and 2468\u00b0K. At low coverage the desorption rate per atom \u03b7_0, was found to be constant with coverage, and to obey the relation \u03b7_0=CT exp (-\u220a\/kT) with C=4\u00d710^10 sec^-1 deg^-1 and \u220a=8.30 eV. The desorption rate increased for coverages between 0.05 and 0.5 monolayer and this increase agreed well with the predictions of a theory based on interactions among the adsorbed atoms.","3":"The Theory of Space, Time and Gravitation","4":"500-Kv-Line Design","5":"In this paper we have considered certain problems which arise when one attempts to cast a covariant field theory into a canonical form. Because of the invariance properties of the theory, certain identities exist between the canonical field variables. To insure that the canonical theory is equivalent to the underlying lagrangian formalism one must require that these identities, once satisfied, will remain satisfied through the course of time. In general, this will be true only if additional constraints are set between the canonical variables. 
We have shown that only a finite number of such constraints exist and that they form a function group. Our proof rests essentially on the possibility of constructing a generating function for an infinitesimal canonical transformation that is equivalent to an invariant infinitesimal transformation on the lagrangian formalism. Once a hamiltonian is obtained by one of the procedures outlined in previous papers of this series, and the constraints have all been found, the consistent, invariant canonical formulation of the theory is completed. The main results of the paper have been formulated in such a manner as to make them applicable to a fairly general type of invariance. In the last sections we have applied these results to the cases of gauge and coordinate invariance. In the latter case a hamiltonian, corresponding to a quadratic lagrangian, has been constructed in a parameter-free form; and in both cases the constraints, together with the poisson bracket relations between them, have been obtained explicitly. As was to be expected, two constraints were found for a gauge-invariant theory and eight for a coordinate-invariant theory.","6":"Absolute change in general relativity","7":"Quantum Restrictions on the Measurability of Fields in Gravitational Theory","8":"Dyson has shown that the evaluation of the S matrix for quantum electrodynamics can be reduced to the problem of evaluating certain quantities, S^'F, D^'F, and \u0393_\u03bc. By making use of a formula relating the T product of an operator with its corresponding N product, integro-differential equations for S^'F and D^'F are obtained. These equations are identical in form with those given by Schwinger for his Green's functions, and hence it is concluded that the two formalisms are equivalent. In addition it is shown that all of the quantities introduced by Schwinger can be expressed in terms of a single quantity, S_vac, the vacuum expectation value of the S matrix. The renormalization problem is not discussed.","9":"In this paper we analyze the difficulties which occur when one attempts to quantize a theory such as electrodynamics or the general theory of relativity. Because of the invariance properties of theories of this type all of the canonical variables of the theory are not independent of one another but rather there exists certain algebraic relations between them called constraints. These constraints plus the Hamiltonian, in the unquantized version of the theory, constitute a function group. It is proved that there exists at least one ordering of factors in the quantized theory for which this is also true. 
From this fact we conclude that it is possible, at least formally, to construct a quantum version of the theories under consideration and that the quantum version will possess the same invariance properties as the corresponding unquantized theory."},"pubIDs":{"0":"10.1016\/0022-3697(60)90159-1","1":"10.1088\/0031-9155\/8\/3\/305","2":"10.1063\/1.1702725","3":"10.1063\/1.3051237","4":"10.1109\/TPAS.1963.291452","5":"10.1103\/PhysRev.83.1018","6":"Absolute change in general relativity","7":"Quantum Restrictions on the Measurability of Fields in Gravitational Theory","8":"10.1103\/PhysRev.94.703","9":"10.1103\/PhysRev.99.1009"}} \ No newline at end of file diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..5af3319 --- /dev/null +++ b/tox.ini @@ -0,0 +1,23 @@ +[tox] +envlist = py37,py38 +isolated_build = True + +[pytest] +minversion = 6.0 +addopts = -ra -q +testpaths = + tests + +[testenv] +deps = + pytest + -rrequirements.txt +commands_pre = python -m spacy download en_core_web_sm +commands = pytest {posargs} + +[testenv:docs] +description = invoke sphinx-build to build the HTML docs +basepython = python3.7 +deps = sphinx >= 1.7.5, < 2 +commands = sphinx-build -d "{toxworkdir}/docs_doctree" docs "{toxworkdir}/docs_out" --color -W -bhtml {posargs} + python -c 'import pathlib; print("documentation available under file://\{0\}".format(pathlib.Path(r"{toxworkdir}") / "docs_out" / "index.html"))' From bd14d0dfd07719c2b8e845c242abb283886301ee Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 29 Sep 2021 13:54:59 +0200 Subject: [PATCH 05/53] wip set readthedocs theme, add testing, fix linkage --- docs/conf.py | 2 +- src/semanticlayertools/cleaning/text.py | 5 +- src/semanticlayertools/linkage/wordscore.py | 87 +++++++++++++------- src/semanticlayertools/utils/__init__.py | 0 src/semanticlayertools/utils/wordscorenet.py | 1 + tox.ini | 4 +- 6 files changed, 65 insertions(+), 34 deletions(-) create mode 100644 src/semanticlayertools/utils/__init__.py create mode 100644 src/semanticlayertools/utils/wordscorenet.py diff --git a/docs/conf.py b/docs/conf.py index 7ce9006..8dbe953 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,7 +51,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/src/semanticlayertools/cleaning/text.py b/src/semanticlayertools/cleaning/text.py index 8d7f2cb..7f3889e 100644 --- a/src/semanticlayertools/cleaning/text.py +++ b/src/semanticlayertools/cleaning/text.py @@ -4,8 +4,7 @@ try: nlp = spacy.load("en_core_web_lg") except OSError: - pass - + nlp = spacy.load("en_core_web_sm") def lemmaSpacy(text): @@ -19,7 +18,7 @@ def lemmaSpacy(text): ) return tokens.lower() except: - return '' + raise def htmlTags(text): diff --git a/src/semanticlayertools/linkage/wordscore.py b/src/semanticlayertools/linkage/wordscore.py index e19a69e..fee1dec 100644 --- a/src/semanticlayertools/linkage/wordscore.py +++ b/src/semanticlayertools/linkage/wordscore.py @@ -18,9 +18,27 @@ class CalculateScores(): """Calculates ngram scores for documents. - Considered parts of speech are (see NLTK docs for details) + Considered parts of speech are (see `nltk` docs for details) - Nouns: 'NN', 'NNS', 'NNP', 'NNPS' - Adjectives: 'JJ', 'JJR', 'JJS' + + All texts of the corpus are tokenized and POS tags are generated. 
+ A global dictionary of counts of different ngrams is build in `allNGrams`. + The ngram relations of every text are listed in `outputDict`. + + Scoring counts occurance of different words left and right of each single + token in each ngram, weighted by ngram size. + + :param sourceDataframe: Dataframe containing the basic corpus + :type sourceDataframe: class:`pandas.DataFrame` + :param textColumn: Column name to use for ngram calculation + :type textColumn: str + :param pubIDColumn: Column name to use for publication identification (assumend to be unique) + :type pubIDColumn: str + :param yearColumn: Column name for temporal ordering publications, used during writing the scoring files + :type yearColumn: str + :param ngramsize: Maximum of considered ngrams (default: 5-gram) + :type ngramsize: int """ def __init__(self, sourceDataframe, textColumn="text", pubIDColumn="pubID", yearColumn='year', ngramsize=5,): @@ -106,37 +124,48 @@ def run(self, write=False, outpath='./'): self.outputDict.update({key: tmpList}) if write is True: for year, df in self.baseDF.groupby(self.yearCol): - with open(f'{outpath}{str(year)}.csv', 'a') as yearfile: + with open(f'{outpath}{str(year)}.tsv', 'a') as yearfile: for pub in df[self.pubIDCol].unique(): for elem in self.outputDict[pub]: - yearfile.write(f'{pub},{elem[0]},{elem[1]}') + yearfile.write(f'{pub}\t{elem[0]}\t{elem[1]}\n') return scores, self.outputDict class LinksOverTime(): - """To keep track of nodes over time, we need a global register of node names. + """Create multilayer pajek files for corpus. + + To keep track of nodes over time, we need a global register of node names. + This class takes care of this, by adding new keys of authors, papers or + ngrams to the register. - Input: + :param dataframe: Source dataframe containing metadata of texts + (authors, publicationID and year) + :type dataframe: class:`pandas.DataFrame` + :param authorColumn: Column name for author information + :param pubIDColumn: Column name to identify publications + :param yearColumn: Column name with year information """ - def __init__(self, outputPath, scorePath, dataframe, authorColumn='authors', pubIDColumn="pubID", yearColumn='year', scoreLimit=1.0, debug=False, windowSize=1): + def __init__( + self, + dataframe, + authorColumn='authors', + pubIDColumn="pubID", + yearColumn='year', + debug=False + ): self.dataframe = dataframe self.authorCol = authorColumn self.pubIDCol = pubIDColumn self.yearColumn = yearColumn - self.scoreLimit = scoreLimit - self.outpath = outputPath - self.scorepath = scorePath self.nodeMap = {} self.debug = debug - self.windowSize = windowSize - def _window(self, seq): + def _window(self, seq, n): """Return a sliding window (of width n) over data from the iterable. s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... 
""" - n = self.windowSize it = iter(seq) result = tuple(islice(it, n)) if len(result) == n: @@ -145,27 +174,27 @@ def _window(self, seq): result = result[1:] + (elem,) yield result - def _createSlices(self): + def _createSlices(self, windowsize): slices = [] years = sorted(self.dataframe[self.yearColumn].unique()) - for x in self._window(years): + for x in self._window(years, windowsize): slices.append(x) return slices - def createNodeRegister(self, sl): + def createNodeRegister(self, sl, scorePath, scoreLimit): """Create multilayer node register for time slice.""" if self.debug is True: print(f'Slice: {sl[0]}') dataframe = self.dataframe[self.dataframe[self.yearColumn].isin(sl)] dfNgramsList = [pd.read_csv( - self.scorepath + str(slN) + '.tsv', + scorePath + str(slN) + '.tsv', sep='\t', header=None ) for slN in sl] ngramdataframe = pd.concat(dfNgramsList) - ngramdataframe = ngramdataframe[ngramdataframe[2] > self.scoreLimit] + ngramdataframe = ngramdataframe[ngramdataframe[2] > scoreLimit] - authorList = [x for y in dataframe[self.authorCol].values for x in y] + authorList = [x for y in [x.split(';') for x in dataframe[self.authorCol].values] for x in y] authors = [x for x in set(authorList) if x] pubs = dataframe[self.pubIDCol].fillna('None').unique() @@ -193,10 +222,10 @@ def createNodeRegister(self, sl): ) ) - def writeLinks(self, sl, recreate=False): - """Write links to file.""" + def writeLinks(self, sl, scorePath, scoreLimit, outpath='./', recreate=False): + """Write multilayer links to file in Pajek format.""" dataframe = self.dataframe[self.dataframe[self.yearColumn].isin(sl)] - filePath = self.outpath + 'multilayerPajek_{0}.net'.format(sl[0]) + filePath = outpath + 'multilayerPajek_{0}.net'.format(sl[0]) if os.path.isfile(filePath): if recreate is False: @@ -207,12 +236,12 @@ def writeLinks(self, sl, recreate=False): os.remove(filePath) dfNgramsList = [pd.read_csv( - self.scorepath + str(slN) + '.tsv', + scorePath + str(slN) + '.tsv', sep='\t', header=None ) for slN in sl] ngramdataframe = pd.concat(dfNgramsList) - ngramdataframe = ngramdataframe[ngramdataframe[2] > self.scoreLimit] + ngramdataframe = ngramdataframe[ngramdataframe[2] > scoreLimit] with open(filePath, 'a') as file: file.write("# A network in a general multiplex format\n") @@ -226,7 +255,7 @@ def writeLinks(self, sl, recreate=False): if self.debug is True: print('\tWriting inter-layer links to file.') for _, row in dataframe.fillna('').iterrows(): - authors = row[self.authorCol] + authors = row[self.authorCol].split(';') paper = row[self.pubIDCol] if paper not in self.nodeMap.keys(): print(f'Cannot find {paper}') @@ -269,8 +298,8 @@ def writeLinks(self, sl, recreate=False): except KeyError: pass - def run(self, recreate=False): - """Create all data for slices.""" - for sl in tqdm(self._createSlices()): - self.createNodeRegister(sl) - self.writeLinks(sl, recreate=recreate) + def run(self, recreate=False, windowsize=1, scorePath='./', outPath='./', scoreLimit=1.0): + """Create data for all slices.""" + for sl in tqdm(self._createSlices(windowsize)): + self.createNodeRegister(sl, scorePath, scoreLimit) + self.writeLinks(sl, scorePath, scoreLimit, outpath=outPath, recreate=recreate) diff --git a/src/semanticlayertools/utils/__init__.py b/src/semanticlayertools/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semanticlayertools/utils/wordscorenet.py b/src/semanticlayertools/utils/wordscorenet.py new file mode 100644 index 0000000..41c2735 --- /dev/null +++ 
b/src/semanticlayertools/utils/wordscorenet.py @@ -0,0 +1 @@ +"""Runs all steps to create a multilayer network.""" diff --git a/tox.ini b/tox.ini index 5af3319..ea55a51 100644 --- a/tox.ini +++ b/tox.ini @@ -18,6 +18,8 @@ commands = pytest {posargs} [testenv:docs] description = invoke sphinx-build to build the HTML docs basepython = python3.7 -deps = sphinx >= 1.7.5, < 2 +deps = + sphinx >= 1.7.5, < 2 + sphinx_rtd_theme commands = sphinx-build -d "{toxworkdir}/docs_doctree" docs "{toxworkdir}/docs_out" --color -W -bhtml {posargs} python -c 'import pathlib; print("documentation available under file://\{0\}".format(pathlib.Path(r"{toxworkdir}") / "docs_out" / "index.html"))' From 1cd58bc084dc198e85ca24e90b21431560d5c0f8 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 30 Sep 2021 13:52:02 +0200 Subject: [PATCH 06/53] add clustering and util fct --- setup.cfg | 1 + src/semanticlayertools/clustering/__init__.py | 0 src/semanticlayertools/clustering/infomap.py | 45 +++++++++++++ src/semanticlayertools/linkage/wordscore.py | 13 +++- src/semanticlayertools/utils/wordscorenet.py | 67 +++++++++++++++++++ 5 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 src/semanticlayertools/clustering/__init__.py create mode 100644 src/semanticlayertools/clustering/infomap.py diff --git a/setup.cfg b/setup.cfg index 4f861e3..80ad574 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,6 +26,7 @@ install_requires = numpy spacy pandas + infomap [options.packages.find] where = src diff --git a/src/semanticlayertools/clustering/__init__.py b/src/semanticlayertools/clustering/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semanticlayertools/clustering/infomap.py b/src/semanticlayertools/clustering/infomap.py new file mode 100644 index 0000000..375945e --- /dev/null +++ b/src/semanticlayertools/clustering/infomap.py @@ -0,0 +1,45 @@ +import os +from tqdm import tqdm +import infomap + + +class Clustering(): + """Cluster using infomap.""" + + def __init__( + self, + infomapSettings="-N5 -imultilayer -fundirected" + ): + self.infomult = infomap.Infomap(infomapSettings) + + def calcInfomap(self, inFilePath, outPath, recreate=False, debug=False): + """Calc clusters for one pajekt file.""" + year = inFilePath.split(os.path.sep)[-1].split('_')[1].split('.')[0] + cluFilePath = f'{outPath}slice_{year}.clu' + ftreeFilePath = f'{outPath}slice_{year}.ftree' + if os.path.isfile(cluFilePath) or os.path.isfile(ftreeFilePath): + if recreate is False: + raise IOError( + f'Files at {cluFilePath} or {ftreeFilePath} exists. Set recreate = True to rewrite files.' 
+ ) + if recreate is True: + os.remove(cluFilePath) + os.remove(ftreeFilePath) + self.infomult.readInputData(inFilePath) + self.infomult.run() + self.infomult.writeClu(cluFilePath) + self.infomult.writeFlowTree(ftreeFilePath) + if debug: + print( + f"Clustered in {self.infomult.maxTreeDepth()} levels with codelength {self.infomult.codelength}" + ) + print("\tDone: Slice {0}!".format(year)) + return + + def run(self, pajekPath='./', outPath='./', recreate=False, debug=False): + """Calculate infomap clustering for all pajek files in path.""" + pajekFiles = sorted( + [pajekPath + x for x in os.listdir(pajekPath) if x.endswith('.net')] + ) + for file in tqdm(pajekFiles): + self.calcInfomap(inFilePath=file, outPath=outPath, debug=debug) diff --git a/src/semanticlayertools/linkage/wordscore.py b/src/semanticlayertools/linkage/wordscore.py index fee1dec..222d7bd 100644 --- a/src/semanticlayertools/linkage/wordscore.py +++ b/src/semanticlayertools/linkage/wordscore.py @@ -111,7 +111,7 @@ def getScore(self, target): target: 1/meta["counts"] * (np.prod(valueList)) ** (1 / (2.0 * meta["maxL"])) } - def run(self, write=False, outpath='./'): + def run(self, write=False, outpath='./', recreate=False): """Get score for all documents.""" scores = {} self.getTermPatterns() @@ -124,7 +124,15 @@ def run(self, write=False, outpath='./'): self.outputDict.update({key: tmpList}) if write is True: for year, df in self.baseDF.groupby(self.yearCol): - with open(f'{outpath}{str(year)}.tsv', 'a') as yearfile: + filePath = f'{outpath}{str(year)}.tsv' + if os.path.isfile(filePath): + if recreate is False: + raise IOError( + f'File at {filePath} exists. Set recreate = True to rewrite file.' + ) + if recreate is True: + os.remove(filePath) + with open(filePath, 'a') as yearfile: for pub in df[self.pubIDCol].unique(): for elem in self.outputDict[pub]: yearfile.write(f'{pub}\t{elem[0]}\t{elem[1]}\n') @@ -195,7 +203,6 @@ def createNodeRegister(self, sl, scorePath, scoreLimit): ngramdataframe = ngramdataframe[ngramdataframe[2] > scoreLimit] authorList = [x for y in [x.split(';') for x in dataframe[self.authorCol].values] for x in y] - authors = [x for x in set(authorList) if x] pubs = dataframe[self.pubIDCol].fillna('None').unique() ngrams = ngramdataframe[1].unique() diff --git a/src/semanticlayertools/utils/wordscorenet.py b/src/semanticlayertools/utils/wordscorenet.py index 41c2735..56c00d1 100644 --- a/src/semanticlayertools/utils/wordscorenet.py +++ b/src/semanticlayertools/utils/wordscorenet.py @@ -1 +1,68 @@ """Runs all steps to create a multilayer network.""" +import tempfile +from datetime import datetime +import os + +from ..cleaning.text import htmlTags, lemmaSpacy +from ..linkage.wordscore import CalculateScores, LinksOverTime +from ..clustering.infomap import Clustering + + +def run( + dataframe, + tempFiles=True, + outPath='./', + textColumn='text', + authorColumn='author', + pubIDColumn='publicationID', + scoreLimit=1.0 +): + """Run all steps for multilayer network generation using wordscoring.""" + clean = dataframe[textColumn].apply(lambda row: lemmaSpacy(htmlTags(row))) + + dataframe.insert(0, 'clean', clean) + + score = CalculateScores( + dataframe, + textColumn='clean', + pubIDColumn=pubIDColumn + ) + links = LinksOverTime( + dataframe, + authorColumn=authorColumn, + pubIDColumn=pubIDColumn + ) + clusters = Clustering() + if tempFiles is True: + with tempfile.TemporaryDirectory() as tmpdirname: + sc, outDict = score.run( + write=True, outpath=f'{tmpdirname}/scores/', recreate=True + ) + links.run( + 
recreate=True, + scorePath=f'{tmpdirname}/scores/', + outPath=f'{tmpdirname}/links/', + scoreLimit=scoreLimit + ) + clusters.run( + pajekPath=f'{tmpdirname}/links/', + outPath=outPath, + ) + else: + timestamp = datetime.now().strftime("_%Y_%m_%d") + basedir = outPath + timestamp + for subdir in ['scores', 'links', 'clusters']: + os.makedirs(basedir + subdir) + sc, outDict = score.run( + write=True, outpath=f'{basedir}/scores/', recreate=True + ) + links.run( + recreate=True, + scorePath=f'{basedir}/scores/', + outPath=f'{basedir}/links/', + scoreLimit=scoreLimit + ) + clusters.run( + pajekPath=f'{basedir}/links/', + outPath=f'{basedir}/clusters', + ) From d478b6abb06afcd9dfdef9164fd2421519d50a8b Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 30 Sep 2021 17:37:46 +0200 Subject: [PATCH 07/53] wip update utils to run pipeline --- src/semanticlayertools/clustering/infomap.py | 2 +- src/semanticlayertools/utils/wordscorenet.py | 45 +++++++------------- 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/src/semanticlayertools/clustering/infomap.py b/src/semanticlayertools/clustering/infomap.py index 375945e..f566d68 100644 --- a/src/semanticlayertools/clustering/infomap.py +++ b/src/semanticlayertools/clustering/infomap.py @@ -8,7 +8,7 @@ class Clustering(): def __init__( self, - infomapSettings="-N5 -imultilayer -fundirected" + infomapSettings="-N5 -imultilayer -fundirected --silent" ): self.infomult = infomap.Infomap(infomapSettings) diff --git a/src/semanticlayertools/utils/wordscorenet.py b/src/semanticlayertools/utils/wordscorenet.py index 56c00d1..050495b 100644 --- a/src/semanticlayertools/utils/wordscorenet.py +++ b/src/semanticlayertools/utils/wordscorenet.py @@ -34,35 +34,22 @@ def run( ) clusters = Clustering() if tempFiles is True: - with tempfile.TemporaryDirectory() as tmpdirname: - sc, outDict = score.run( - write=True, outpath=f'{tmpdirname}/scores/', recreate=True - ) - links.run( - recreate=True, - scorePath=f'{tmpdirname}/scores/', - outPath=f'{tmpdirname}/links/', - scoreLimit=scoreLimit - ) - clusters.run( - pajekPath=f'{tmpdirname}/links/', - outPath=outPath, - ) + basedir = tempfile.TemporaryDirectory().name else: timestamp = datetime.now().strftime("_%Y_%m_%d") basedir = outPath + timestamp - for subdir in ['scores', 'links', 'clusters']: - os.makedirs(basedir + subdir) - sc, outDict = score.run( - write=True, outpath=f'{basedir}/scores/', recreate=True - ) - links.run( - recreate=True, - scorePath=f'{basedir}/scores/', - outPath=f'{basedir}/links/', - scoreLimit=scoreLimit - ) - clusters.run( - pajekPath=f'{basedir}/links/', - outPath=f'{basedir}/clusters', - ) + for subdir in ['scores', 'links', 'clusters']: + os.makedirs(os.path.join(basedir, subdir)) + sc, outDict = score.run( + write=True, outpath=f'{basedir}/scores/', recreate=True + ) + links.run( + recreate=True, + scorePath=f'{basedir}/scores/', + outPath=f'{basedir}/links/', + scoreLimit=scoreLimit + ) + clusters.run( + pajekPath=f'{basedir}/links/', + outPath=f'{outPath}', + ) From f1c4c41e4346ec1c7516d4071948ed5b5a01794b Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Fri, 1 Oct 2021 13:52:30 +0200 Subject: [PATCH 08/53] wip updt origin, data testing complete, no tests written for clustering yet --- src/semanticlayertools/utils/wordscorenet.py | 30 +++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/semanticlayertools/utils/wordscorenet.py b/src/semanticlayertools/utils/wordscorenet.py index 050495b..2aae469 100644 --- 
a/src/semanticlayertools/utils/wordscorenet.py +++ b/src/semanticlayertools/utils/wordscorenet.py @@ -13,6 +13,7 @@ def run( tempFiles=True, outPath='./', textColumn='text', + yearColumn='year', authorColumn='author', pubIDColumn='publicationID', scoreLimit=1.0 @@ -30,17 +31,20 @@ def run( links = LinksOverTime( dataframe, authorColumn=authorColumn, - pubIDColumn=pubIDColumn + pubIDColumn=pubIDColumn, + yearColumn=yearColumn ) clusters = Clustering() if tempFiles is True: basedir = tempfile.TemporaryDirectory().name + clusterout = outPath else: timestamp = datetime.now().strftime("_%Y_%m_%d") - basedir = outPath + timestamp + basedir = outPath + 'Clustering' + timestamp + clusterout = f'{basedir}/clusters/' for subdir in ['scores', 'links', 'clusters']: os.makedirs(os.path.join(basedir, subdir)) - sc, outDict = score.run( + score.run( write=True, outpath=f'{basedir}/scores/', recreate=True ) links.run( @@ -51,5 +55,23 @@ def run( ) clusters.run( pajekPath=f'{basedir}/links/', - outPath=f'{outPath}', + outPath=clusterout, ) + with open(f'{basedir}/README.txt', 'w+') as file: + file.write( + f"""Run of clustering {datetime.now().strftime("%Y_%m_%d")} + + Text cleaned in column: {textColumn} (html tags removed and lemmatized) + Authors information from column: {authorColumn} + Unique publication IDs from columns: {pubIDColumn} + Ngram scores greater {scoreLimit} were considered for link creation. + Clustering result in folder: {clusterout} + """ + ) + if tempFiles is True: + file.write( + 'Temporay files for wordscores and multilayer network were deleted.' + ) + print(f"""Results in {clusterout}.\n + Head over to https://www.mapequation.org/alluvial/ to visualize the ftree files. + """) From 1ba08e4dd32f4fffd9e93b2a89baa5b29f19dbb5 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 4 Oct 2021 16:31:57 +0200 Subject: [PATCH 09/53] fix multiprocessing --- src/semanticlayertools/linkage/wordscore.py | 75 +++++++++++--------- src/semanticlayertools/utils/wordscorenet.py | 36 +++++++--- 2 files changed, 65 insertions(+), 46 deletions(-) diff --git a/src/semanticlayertools/linkage/wordscore.py b/src/semanticlayertools/linkage/wordscore.py index 222d7bd..84bdaa7 100644 --- a/src/semanticlayertools/linkage/wordscore.py +++ b/src/semanticlayertools/linkage/wordscore.py @@ -2,7 +2,7 @@ import re from collections import Counter, defaultdict from itertools import islice, combinations - +from multiprocessing import Pool, cpu_count from tqdm import tqdm import numpy as np import pandas as pd @@ -41,7 +41,15 @@ class CalculateScores(): :type ngramsize: int """ - def __init__(self, sourceDataframe, textColumn="text", pubIDColumn="pubID", yearColumn='year', ngramsize=5,): + def __init__( + self, + sourceDataframe, + textColumn="text", + pubIDColumn="pubID", + yearColumn='year', + ngramsize=5, + debug=False + ): self.baseDF = sourceDataframe self.textCol = textColumn @@ -51,8 +59,9 @@ def __init__(self, sourceDataframe, textColumn="text", pubIDColumn="pubID", year self.outputDict = {} self.allNGrams = [] self.counts = {} - self.allgramslist = [] + self.corpussize = 1 self.uniqueNGrams = () + self.debug=debug def getTermPatterns(self): """Create dictionaries of occuring ngrams.""" @@ -72,51 +81,47 @@ def getTermPatterns(self): self.outputDict[row[self.pubIDCol]] = tempNGram self.allNGrams = allNGrams allgrams = [x for y in [y for x, y in self.allNGrams.items()] for x in y] - self.allgramslist = allgrams + self.corpussize = len(allgrams) self.counts = Counter(allgrams) self.uniqueNGrams = set(allgrams) def 
getScore(self, target): """Calculate ngram score.""" - meta = { - "target": target, - "counts": self.counts[target], - "corpusL": len(self.allgramslist), - "maxL": len(target), - } - - res = defaultdict(list) - - for idx, subgram in enumerate(target): - key = idx + 1 - for tup in self.allNGrams[2]: - if tup[1:][0] == subgram: - res[f"l_{key}"].append(tup[:1][0]) - elif tup[:-1][0] == subgram: - res[f"r_{key}"].append(tup[1:][0]) valueList = [] - for L in range(1, meta["maxL"] + 1, 1): - leftkey = f"l_{L}" - rightkey = f"r_{L}" - if rightkey not in res.keys(): - rvalue = 0 - else: - rvalue = len(list(set(res[rightkey]))) - if leftkey not in res.keys(): - lvalue = 0 - else: - lvalue = len(list(set(res[leftkey]))) + for _, subgram in enumerate(target): + contains = [x for x in self.allNGrams[2] if subgram in x] + rvalue = len(set(x for x in contains if x[0] == subgram)) + lvalue = len(set(x for x in contains if x[1] == subgram)) valueList.append((lvalue + 1) * (rvalue + 1)) return { - target: 1/meta["counts"] * (np.prod(valueList)) ** (1 / (2.0 * meta["maxL"])) + target: 1/self.counts[target] * (np.prod(valueList)) ** (1 / (2.0 * len(target))) } - def run(self, write=False, outpath='./', recreate=False): + def _calcBatch(self, batch): + res = [] + for elem in tqdm(batch): + res.append(self.getScore(elem)) + return res + + def run(self, write=False, outpath='./', recreate=False, limitCPUs=True): """Get score for all documents.""" scores = {} self.getTermPatterns() - for target in tqdm(self.uniqueNGrams): - scores.update(self.getScore(target)) + if self.debug is True: + print(f'Found {len(self.uniqueNGrams)} unique {self.ngramEnd}-grams.') + if limitCPUs is True: + ncores = int(cpu_count()*1/4) + else: + ncores = cpu_count() - 2 + pool = Pool(ncores) + chunk_size = int(len(self.uniqueNGrams)/ncores) + batches = [ + list(self.uniqueNGrams)[i:i+chunk_size] for i in range(0, len(self.uniqueNGrams), chunk_size) + ] + ncoresResults = pool.map(self._calcBatch, batches) + results = [x for y in ncoresResults for x in y] + for elem in results: + scores.update(elem) for key, val in self.outputDict.items(): tmpList = [] for elem in val: diff --git a/src/semanticlayertools/utils/wordscorenet.py b/src/semanticlayertools/utils/wordscorenet.py index 2aae469..5996fda 100644 --- a/src/semanticlayertools/utils/wordscorenet.py +++ b/src/semanticlayertools/utils/wordscorenet.py @@ -16,17 +16,33 @@ def run( yearColumn='year', authorColumn='author', pubIDColumn='publicationID', + ngramsize=5, scoreLimit=1.0 ): """Run all steps for multilayer network generation using wordscoring.""" + + if tempFiles is True: + basedir = tempfile.TemporaryDirectory().name + clusterout = outPath + else: + timestamp = datetime.now().strftime("_%Y_%m_%d") + basedir = outPath + 'Clustering' + timestamp + clusterout = f'{basedir}/clusters/' + for subdir in ['scores', 'links', 'clusters']: + os.makedirs(os.path.join(basedir, subdir)) + print(f'Start cleaning {textColumn} column.') clean = dataframe[textColumn].apply(lambda row: lemmaSpacy(htmlTags(row))) dataframe.insert(0, 'clean', clean) + if tempFiles is False: + dataframe.to_json(f'{basedir}/sourceDFcleaned.json', orient='records', lines=True) + print('\tDone.') score = CalculateScores( dataframe, textColumn='clean', - pubIDColumn=pubIDColumn + pubIDColumn=pubIDColumn, + ngramsize=ngramsize ) links = LinksOverTime( dataframe, @@ -35,28 +51,26 @@ def run( yearColumn=yearColumn ) clusters = Clustering() - if tempFiles is True: - basedir = tempfile.TemporaryDirectory().name - clusterout = 
outPath - else: - timestamp = datetime.now().strftime("_%Y_%m_%d") - basedir = outPath + 'Clustering' + timestamp - clusterout = f'{basedir}/clusters/' - for subdir in ['scores', 'links', 'clusters']: - os.makedirs(os.path.join(basedir, subdir)) + + print(f'Start calculating scores for {dataframe.shape[0]} texts.') score.run( write=True, outpath=f'{basedir}/scores/', recreate=True ) + print('\tDone.') + print(f'Start creating links with scoreLimit > {scoreLimit}.') links.run( recreate=True, scorePath=f'{basedir}/scores/', outPath=f'{basedir}/links/', scoreLimit=scoreLimit ) + print('\tDone.') + print('Start calculating infomap clusters.') clusters.run( pajekPath=f'{basedir}/links/', outPath=clusterout, ) + print('\tDone.') with open(f'{basedir}/README.txt', 'w+') as file: file.write( f"""Run of clustering {datetime.now().strftime("%Y_%m_%d")} @@ -70,7 +84,7 @@ def run( ) if tempFiles is True: file.write( - 'Temporay files for wordscores and multilayer network were deleted.' + 'Temporay files for wordscores and multilayer networks were deleted.' ) print(f"""Results in {clusterout}.\n Head over to https://www.mapequation.org/alluvial/ to visualize the ftree files. From 7c983f8470acbb84a828751923c9398e82d35c1f Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 9 Dec 2021 18:13:59 +0100 Subject: [PATCH 10/53] add routine for cocitations --- src/semanticlayertools/linkage/cocitation.py | 66 ++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 src/semanticlayertools/linkage/cocitation.py diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py new file mode 100644 index 0000000..12b5056 --- /dev/null +++ b/src/semanticlayertools/linkage/cocitation.py @@ -0,0 +1,66 @@ +"""Link documents by cocitation.""" +import os +import time +import tempfile +import multiprocessing +from itertools import combinations +from collections import Counter + +import pandas as pd +import numpy as np + +num_processes = multiprocessing.cpu_count() + + +class Cocitations(): + """Cocitation calculations.""" + + def __init__( + self, inpath, outpath, columnName, numberProc=num_processes, debug=False + ): + self.inpath = inpath + self.outpath = outpath + self.columnName = columnName + self.numberProc = numberProc + self.debug = debug + + def getCombinations(self, chunk): + """Calculate combinations.""" + res = [] + for idx, row in chunk.iterrows(): + comb = combinations(row[self.columnName], 2) + for elem in list(comb): + res.append((elem)) + return res + + def calculateCoCitation(self, filepath): + """Do calculation for input file.""" + infilename = filepath.split(os.path.sep)[-1].split('.')[0] + starttime = time.time() + try: + data = pd.read_json(filepath, lines=True).dropna(subset=[self.columnName]) + chunk_size = int(data.shape[0] / self.numberProc) + chunks = np.array_split(data, chunk_size) + pool = multiprocessing.Pool(processes=self.numberProc) + cocitations = pool.map(self.getCombinations, chunks) + cocitCounts = Counter([x for y in cocitations for x in y]) + sortCoCitCounts = cocitCounts.most_common() + with open(self.outpath + infilename + '.csv', 'w') as outfile: + for edge in sortCoCitCounts: + outfile.write(f"{edge[0][0]},{edge[0][1]},{edge[1]}\n") + except: + raise + if self.debug == "l2": + print(f'\tDone in {starttime - time.time()} seconds.') + return + + def processFolder(self): + """Calculate cocitation for all files in folder.""" + starttime = time.time() + for file in os.listdir(self.inpath): + try: + 
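            # Illustrative sketch, not part of the committed diff: calculateCoCitation,
            # defined above, counts for every document all unordered pairs of its cited
            # references via itertools.combinations, e.g.
            #     list(combinations(['refA', 'refB', 'refC'], 2))
            #     # -> [('refA', 'refB'), ('refA', 'refC'), ('refB', 'refC')]
            # collections.Counter then turns pairs repeated across documents into weighted
            # cocitation edges, which the call below writes to one CSV per input file.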
self.calculateCoCitation(os.path.join(self.inpath, file)) + except: + raise + if self.debug is True: + print(f'\tDone in {starttime - time.time()} seconds.') From baea9a12909570ccc4a675fb06527f2442776aa9 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Fri, 10 Dec 2021 13:46:59 +0100 Subject: [PATCH 11/53] add giant component writing --- src/semanticlayertools/linkage/cocitation.py | 40 ++++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index 12b5056..58ffa9f 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -1,33 +1,45 @@ """Link documents by cocitation.""" import os import time -import tempfile import multiprocessing from itertools import combinations from collections import Counter +from typing import TypeVar +import igraph as ig import pandas as pd import numpy as np num_processes = multiprocessing.cpu_count() +limitRefLength = TypeVar('limitRefLength', bool, int) +debugVar = TypeVar('debugVar', bool, str) class Cocitations(): """Cocitation calculations.""" def __init__( - self, inpath, outpath, columnName, numberProc=num_processes, debug=False + self, inpath, outpath, columnName, + numberProc: int=num_processes, limitRefLength: limitRefLength=False, debug: debugVar=False, ): self.inpath = inpath self.outpath = outpath self.columnName = columnName self.numberProc = numberProc + self.limitRefLength = limitRefLength self.debug = debug def getCombinations(self, chunk): """Calculate combinations.""" res = [] - for idx, row in chunk.iterrows(): + if type(self.limitRefLength) == int: + reflen = chunk[self.columnName].apply( + lambda x: True if type(x)==list and len(x)<=self.limitRefLength else False + ) + data = chunk[reflen].copy() + else: + data = chunk.copy() + for idx, row in data.iterrows(): comb = combinations(row[self.columnName], 2) for elem in list(comb): res.append((elem)) @@ -44,14 +56,26 @@ def calculateCoCitation(self, filepath): pool = multiprocessing.Pool(processes=self.numberProc) cocitations = pool.map(self.getCombinations, chunks) cocitCounts = Counter([x for y in cocitations for x in y]) - sortCoCitCounts = cocitCounts.most_common() - with open(self.outpath + infilename + '.csv', 'w') as outfile: + sortCoCitCounts = [ + (x[0][0], x[0][1], x[1]) for x in cocitCounts.most_common() + ] + tempG = ig.Graph.TupleList(sortCoCitCounts, weights=True, vertex_name_attr='id') + components = tempG.components() + sortedComponents = sorted( + [(x, len(x)) for x in components], key=lambda x: x[1], reverse=True + ) + giantComponent = sortedComponents[0] + giantComponentGraph = tempG.vs.select(giantComponent[0]).subgraph() + giantComponentGraph.write_pajek( + os.path.join(self.outpath,infilename + '_GC.net') + ) + with open(os.path.join(self.outpath,infilename + '.ncol'), 'w') as outfile: for edge in sortCoCitCounts: - outfile.write(f"{edge[0][0]},{edge[0][1]},{edge[1]}\n") + outfile.write(f"{edge[0]} {edge[1]} {edge[2]}\n") except: raise if self.debug == "l2": - print(f'\tDone in {starttime - time.time()} seconds.') + print(f'\tDone in {time.time() - starttime} seconds.') return def processFolder(self): @@ -63,4 +87,4 @@ def processFolder(self): except: raise if self.debug is True: - print(f'\tDone in {starttime - time.time()} seconds.') + print(f'\tDone in {time.time() - starttime} seconds.') From 532e56ef21b58239928e6c7bde7afe0cd4356a07 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 15 Dec 2021 
07:15:12 +0100 Subject: [PATCH 12/53] upd orig --- src/semanticlayertools/linkage/cocitation.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index 58ffa9f..a63e967 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -9,6 +9,7 @@ import igraph as ig import pandas as pd import numpy as np +from tqdm import tqdm num_processes = multiprocessing.cpu_count() @@ -62,8 +63,16 @@ def calculateCoCitation(self, filepath): tempG = ig.Graph.TupleList(sortCoCitCounts, weights=True, vertex_name_attr='id') components = tempG.components() sortedComponents = sorted( - [(x, len(x)) for x in components], key=lambda x: x[1], reverse=True + [(x, len(x), len(x)*100/len(tempG.vs)) for x in components], key=lambda x: x[1], reverse=True ) + with open(os.path.join(self.outpath,infilename + '_graphMetadata.txt'), 'w') as outfile: + outfile.write(f'Graph derived from {filepath}\nSummary:\n') + outfile.write(tempG.summary() + '\n\nComponents (ordered by size):\n\n') + for idx, elem in enumerate(sortedComponents): + gcompTemp = tempG.vs.select(elem[0]).subgraph() + outfile.write( + f"{idx}:\n\t{elem[1]} nodes ({elem[2]:.3f}% of full graph)\n\t{len(gcompTemp.es)} edges ({len(gcompTemp.es)*100/len(tempG.es):.3f}% of full graph)\n\n" + ) giantComponent = sortedComponents[0] giantComponentGraph = tempG.vs.select(giantComponent[0]).subgraph() giantComponentGraph.write_pajek( @@ -81,7 +90,7 @@ def calculateCoCitation(self, filepath): def processFolder(self): """Calculate cocitation for all files in folder.""" starttime = time.time() - for file in os.listdir(self.inpath): + for file in tqdm(os.listdir(self.inpath)): try: self.calculateCoCitation(os.path.join(self.inpath, file)) except: From c885446ef4dbb740ffb08de1c1f8444e662a133c Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 15 Dec 2021 15:23:00 +0100 Subject: [PATCH 13/53] add leiden time clusters and streamgraph visuals, WIP --- setup.cfg | 5 +- src/semanticlayertools/clustering/leiden.py | 118 +++++++++++++++++++ src/semanticlayertools/linkage/cocitation.py | 2 +- src/semanticlayertools/visual/__init__.py | 0 src/semanticlayertools/visual/utils.py | 38 ++++++ 5 files changed, 161 insertions(+), 2 deletions(-) create mode 100644 src/semanticlayertools/clustering/leiden.py create mode 100644 src/semanticlayertools/visual/__init__.py create mode 100644 src/semanticlayertools/visual/utils.py diff --git a/setup.cfg b/setup.cfg index 80ad574..f990da3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = semanticlayertools -version = 0.0.1 +version = 0.0.3 author = Malte Vogl author_email = mvogl@mpiwg-berlin.mpg.de description = Create semantic layers using different methods for word linking. 
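Aside: the cocitation stage committed above (PATCH 10-12) leaves three artefacts per year slice -- a weighted edge list (<year>.ncol), a Pajek file of the giant component (<year>_GC.net) and a plain-text graph summary. A minimal sketch of inspecting one such edge list with igraph; the path and year are illustrative assumptions, the igraph calls mirror those used in calculateCoCitation:

    import igraph as ig

    # Illustrative path: any slice written by Cocitations.processFolder() works.
    # Edge weights are read automatically from the third NCOL column.
    graph = ig.Graph.Read_Ncol('cocite/1960.ncol', directed=False)
    components = graph.components()
    giant = sorted(components, key=len, reverse=True)[0]   # vertex ids of the giant component
    giant_graph = graph.subgraph(giant)
    print(graph.summary())
    print(f'{giant_graph.vcount()} of {graph.vcount()} nodes are in the giant component.')
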
@@ -22,11 +22,14 @@ packages = find: python_requires = >=3.7 install_requires = tqdm + matplotlib nltk numpy spacy pandas infomap + igraph + leidenalg [options.packages.find] where = src diff --git a/src/semanticlayertools/clustering/leiden.py b/src/semanticlayertools/clustering/leiden.py new file mode 100644 index 0000000..31b43ea --- /dev/null +++ b/src/semanticlayertools/clustering/leiden.py @@ -0,0 +1,118 @@ +import os +import time +import re +from typing import TypeVar + +from tqdm import tqdm + +import igraph as ig +import leidenalg as la + +debugVar = TypeVar('debugVar', bool, str) + + +class TimeCluster(): + """Cluster time-sliced data with the Leiden algorithm.""" + + def __init__( + self, inpath: str, outpath: str, + resolution: float = 0.003, intersliceCoupling: float = 0.4, + timerange: tuple = (1945, 2005), + debug: debugVar = False + ): + starttime = time.time() + self.inpath = inpath + self.outpath = outpath + self.res_param = resolution + self.interslice_param = intersliceCoupling + self.timerange = timerange + self.debug = debug + + self.outfile = os.path.join( + outpath, + f'timeclusters_{timerange[0]}-{timerange[1]}_res_{resolution}_intersl_{intersliceCoupling}.csv' + ) + if os.path.isfile(self.outfile): + raise OSError('Output file exists. Please remove.') + + edgefiles = [x for x in os.listdir(inpath) if x.endswith('_GC.net')] + + self.graphDict = {} + + for idx in tqdm(range(len(edgefiles))): + try: + year = re.findall(r'\d{4}', edgefiles[idx])[0] + except: + raise + if timerange[0] <= int(year) <= timerange[1]: + graph = ig.Graph.Read_Pajek(os.path.join(inpath, edgefiles[idx])) + self.graphDict[year] = graph + + self.optimiser = la.Optimiser() + + print( + "Graphs between " + f"{min(list(self.graphDict.keys()))} and " + f"{max(list(self.graphDict.keys()))} " + f"loaded in {time.time() - starttime} seconds." + ) + + def optimize(self): + """Optimize clusters accross time slices.""" + starttime = time.time() + + layers, interslice_layer, _ = la.time_slices_to_layers( + list(self.graphDict.values()), + interslice_weight=self.interslice_param, + vertex_id_attr='name' + ) + print('\tHave set layers.') + + partitions = [ + la.CPMVertexPartition( + H, + node_sizes='node_size', + weights='weight', + resolution_parameter=self.res_param + ) for H in layers + ] + print('\tHave set partitions.') + + interslice_partition = la.CPMVertexPartition( + interslice_layer, + resolution_parameter=0, + node_sizes='node_size', + weights='weight' + ) + print('\tHave set interslice partions.') + + self.optimiser.optimise_partition_multiplex( + partitions + [interslice_partition] + ) + + subgraphs = interslice_partition.subgraphs() + + commun = [] + for idx, part in enumerate(subgraphs): + nodevals = [ + ( + x['name'], + list(self.graphDict.keys()).pop(x['slice']), + idx + ) for x in part.vs + ] + commun.extend(nodevals) + + with open(self.outfile, 'w') as outfile: + outfile.write('node,year,cluster\n') + for elem in commun: + outfile.write( + f"{elem[0]},{elem[1]},{elem[2]}\n" + ) + + print( + f'Finished in {time.time() - starttime} seconds.' + f"Found {len(subgraphs)} clusters." 
+ ) + + return commun diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index a63e967..a78a745 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -17,7 +17,7 @@ debugVar = TypeVar('debugVar', bool, str) class Cocitations(): - """Cocitation calculations.""" + """Create cocitation networks.""" def __init__( self, inpath, outpath, columnName, diff --git a/src/semanticlayertools/visual/__init__.py b/src/semanticlayertools/visual/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py new file mode 100644 index 0000000..a2eff8d --- /dev/null +++ b/src/semanticlayertools/visual/utils.py @@ -0,0 +1,38 @@ +import matplotlib.pyplot as plt +import pandas as pd + + +def streamgraph(filepath): + """Plot streamgraph of cluster sizes vs years.""" + basedf = pd.read_csv(filepath) + basedata = basedf.groupby(['year', 'cluster']).size().to_frame('counts').reset_index() + yearbase = [ + str(x) for x in range( + int(basedata.year.min()), int(basedata.year.max()) + 1 + ) + ] + cluDict = {} + for clu in basedata.cluster.unique(): + cluvec = [] + basedf = basedata.query('cluster == @clu') + baseyears = basedf.year.unique() + for year in yearbase: + if year in baseyears: + cluvec.append(basedf.query('year == @year').counts.iloc[0]) + else: + cluvec.append(0) + cluDict[clu] = cluvec + + fig, ax = plt.subplots(figsize=(10, 7)) + ax.stackplot( + yearbase, + cluDict.values(), + labels=cluDict.keys(), + baseline='sym' + ) + ax.set_title('Cluster sizes') + ax.set_xlabel('Year') + ax.set_ylabel('Number of publications') + ax.axhline(0, color="black", ls="--") + + return fig From b481ef905e183c4f2e332b987458c07c92bb50f4 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 16 Dec 2021 14:46:28 +0100 Subject: [PATCH 14/53] finish streamgraph, add routine for reports, WIP --- setup.cfg | 2 + src/semanticlayertools/clustering/reports.py | 169 ++++++++++++++++++ .../{utils => pipelines}/__init__.py | 0 .../{utils => pipelines}/wordscorenet.py | 0 src/semanticlayertools/visual/utils.py | 82 ++++++--- tox.ini | 2 +- 6 files changed, 232 insertions(+), 23 deletions(-) create mode 100644 src/semanticlayertools/clustering/reports.py rename src/semanticlayertools/{utils => pipelines}/__init__.py (100%) rename src/semanticlayertools/{utils => pipelines}/wordscorenet.py (100%) diff --git a/setup.cfg b/setup.cfg index f990da3..9a67afc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,7 +25,9 @@ install_requires = matplotlib nltk numpy + scipy spacy + textacy pandas infomap igraph diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py new file mode 100644 index 0000000..5110510 --- /dev/null +++ b/src/semanticlayertools/clustering/reports.py @@ -0,0 +1,169 @@ +import re +import os +from tqdm import tqdm + +import spacy +import textacy +import textacy.tm +import pandas as pd +import multiprocessing + +num_processes = multiprocessing.cpu_count() + +mainLanguageCorp = 'en_core_web_lg' +nlp = spacy.load(mainLanguageCorp) + + +class ClusterReports(): + + def __init__( + self, infile:str, metadatapath:str, outpath:str, iteration: int, + numberProc: int=num_processes, minClusterSize: int=1000 + ): + self.iteration = iteration + self.numberProc = numberProc + self.minClusterSize = minClusterSize + self.metadatapath = metadatapath + self.outpath = outpath + + self.clusterdf = 
pd.read_csv(infile) + basedata = self.clusterdf.groupby(['year', 'cluster']).size().to_frame('counts').reset_index() + largeClusterList = list(basedata.groupby('cluster').sum().query(f'counts > {self.minClusterSize}').index) + + self.clusternodes = self.clusterdf.query( + 'cluster in @largeClusterList' + ) + + def create_corpus(self, dataframe): + """Create corpus out of dataframe.""" + docs = [] + titles = [x[0] for x in dataframe.title.values if type(x) == list] + for title in tqdm(titles): + try: + # text pre-processing + title = re.sub("\n", " ", title) + title = re.sub("[\r|\t|\x0c|\d+]", "", title) + title = re.sub("[.,]", "", title) + title = re.sub("\\\'s", "'s", title) + title = title.lower() + + doc = nlp(title) + + tokens_without_sw = ' '.join([t.lemma_ for t in doc if not t.is_stop]) + + docs.append(tokens_without_sw) + except: + print(title) + raise + + corpus_titles = textacy.Corpus(mainLanguageCorp, data=docs) + return corpus_titles + + + def find_topics(self, corpus_titles: list, n_topics: int, top_words: int, outpath:str='./', writeReport: bool=False): + """Calculate topics in corpus.""" + vectorizer = textacy.representations.vectorizers.Vectorizer( + tf_type="linear", + idf_type="smooth", + norm="l2", + min_df=2, + max_df=0.95 + ) + tokenized_docs = ( + ( + term.lemma_ for term in textacy.extract.terms(doc, ngs=1, ents=True) + ) for doc in corpus_titles + ) + doc_term_matrix = vectorizer.fit_transform(tokenized_docs) + + model = textacy.tm.TopicModel("nmf", n_topics) + model.fit(doc_term_matrix) + + doc_topic_matrix = model.transform(doc_term_matrix) + + topics = [] + for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_words): + topics.append("topic " + str(topic_idx) + ": " + " ".join(top_terms)) + if writeReport is True: + outfile.write(f'\n\n\tTopics in cluster for {n_topics} topics:\n') + for topic in topics: + outfile.write(f'\t\t{topic}\n') + else: + print("\nTopics in the cluster:\n") + return topics + + def fullReport(self, cluster): + """Generate full cluster report.""" + with open(f'{outFolderReports}Report_{cluster}.txt', 'a') as outfile: + selection = self.clusterdf.query('cluster == @cluster') + nodeList = list(set(selection.node.values)) + starttime = time.time() + result = {} + resultMeta = [] + for key, vals in groupby(sorted(nodeList), lambda x: x[:4]): + result[int(key)] = list(vals) + + for key in result.keys(): + if key > 1949 and key < 2005 and key != 1996: + yeardata = pd.read_json(f'{inFolderMetadata}{key}_meta.json', lines=True) + selectNodedata = yeardata[yeardata.nodeID.isin(result[key])] + resultMeta.append(selectNodedata) + + metadata = pd.concat(resultMeta) + metadata.to_json( + f'{outFolderReports}meta/cluster_{cluster}_meta.json', + orient='records', + lines=True + ) + foundNodes = [x[0] for x in metadata.bibcode.values] + notFound = [x for x in nodeList if x not in foundNodes] + + outfile.write( + f'\tGot {len(nodeList)} unique publications in time range:\ + {selection.year.min()} to {selection.year.max()}.\n' + ) + outfile.write( + f'\t\tFound metadata for {metadata.shape[0]} publications.\n' + ) + outfile.write( + f'\t\tThere are {len([x for x in foundNodes if x not in nodeList])}\ + found publications which where NOT in the query list.\n' + ) + outfile.write( + f'\t\tThere are {len(notFound)} publication(s) which where NOT found:\n' + ) + + topAuthors = Counter( + [x for y in [x for x in metadata.author.values if type(x) == list] for x in y] + ).most_common(20) + outfile.write('\n\tThe top authors of this 
cluster are:\n') + for elem in topAuthors: + outfile.write(f'\t\t{elem[0]}: {elem[1]} pubs\n.') + topAffils = Counter( + [x for y in [x for x in metadata.aff.values if type(x) == list] for x in y] + ).most_common(20) + outfile.write('\n\tThe top 20 affiliations of this cluster are:\n') + for elem in topAffils: + outfile.write(f'\t\t{elem[0]}: {elem[1]} authors.\n') + outfile.write( + f'\n\n\tFinished analysis of cluster {cluster} with {len(nodeList)}\ + unique publications in {time.time()- starttime} seconds.\n\n' + ) + corpus = create_corpus(metadata) + find_topics( + corpus, n_topics=15, top_words=10, writeReport=True, outfile=outfile + ) + find_topics( + corpus, n_topics=50, top_words=10, writeReport=True, outfile=outfile + ) + outfile.write( + f'\n\n\tFinished analysis of topics in {cluster} in {time.time()- starttime} seconds.\n\n' + ) + return cluster + + def processClusters(self, publicationIDcolumn: str='nodeID'): + for filename in tqdm(os.listdir(self.metadatapath)): + filepath = os.path.join(self.metadatapath, filename) + data = pd.read_json(filepath, lines=True) + selectMerge = data.merge(self.clusternodes, left_on=publicationIDcolumn, right_on='node', how='inner') + selectMerge.to_json(os.path.join(self.outpath, 'merge_' + filename) , orient='records', lines=True) diff --git a/src/semanticlayertools/utils/__init__.py b/src/semanticlayertools/pipelines/__init__.py similarity index 100% rename from src/semanticlayertools/utils/__init__.py rename to src/semanticlayertools/pipelines/__init__.py diff --git a/src/semanticlayertools/utils/wordscorenet.py b/src/semanticlayertools/pipelines/wordscorenet.py similarity index 100% rename from src/semanticlayertools/utils/wordscorenet.py rename to src/semanticlayertools/pipelines/wordscorenet.py diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index a2eff8d..0f79bdf 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -1,38 +1,76 @@ import matplotlib.pyplot as plt import pandas as pd +import numpy as np +from scipy import stats +from typing import TypeVar +smoothing = TypeVar('smoothing', bool, float) -def streamgraph(filepath): - """Plot streamgraph of cluster sizes vs years.""" + +def gaussian_smooth(x, y, grid, sd): + weights = np.transpose([stats.norm.pdf(grid, m, sd) for m in x]) + weights = weights / weights.sum(0) + return (weights * y).sum(1) + + +def streamgraph(filepath: str, smooth: smoothing=False, minClusterSize: int=1000, showNthGrid: int=5): + """Plot streamgraph of cluster sizes vs years. 
+ + Based on https://www.python-graph-gallery.com/streamchart-basic-matplotlib + """ basedf = pd.read_csv(filepath) basedata = basedf.groupby(['year', 'cluster']).size().to_frame('counts').reset_index() yearbase = [ - str(x) for x in range( + x for x in range( int(basedata.year.min()), int(basedata.year.max()) + 1 ) ] + largeclu = list(basedata.groupby('cluster').sum().query(f'counts > {minClusterSize}').index) cluDict = {} for clu in basedata.cluster.unique(): - cluvec = [] - basedf = basedata.query('cluster == @clu') - baseyears = basedf.year.unique() - for year in yearbase: - if year in baseyears: - cluvec.append(basedf.query('year == @year').counts.iloc[0]) - else: - cluvec.append(0) - cluDict[clu] = cluvec - - fig, ax = plt.subplots(figsize=(10, 7)) - ax.stackplot( - yearbase, - cluDict.values(), - labels=cluDict.keys(), - baseline='sym' - ) + if clu in largeclu: + cluvec = [] + basedf = basedata.query('cluster == @clu') + baseyears = list(basedf.year.unique()) + for year in yearbase: + if year in baseyears: + cluvec.append(basedf.query('year == @year').counts.iloc[0]) + else: + cluvec.append(0) + cluDict[clu] = cluvec + + fig, ax = plt.subplots(figsize=(16, 9)) + if type(smooth) is float: + grid = np.linspace(yearbase[0], yearbase[-1], num=100) + y = [np.array(x) for x in cluDict.values()] + y_smoothed = [gaussian_smooth(yearbase, y_, grid, smooth) for y_ in y] + ax.stackplot( + grid, + y_smoothed, + labels=cluDict.keys(), + baseline="sym" + ,colors=plt.get_cmap('tab20').colors + ) + + pass + else: + ax.stackplot( + yearbase, + cluDict.values(), + labels=cluDict.keys(), + baseline='sym', + colors=plt.get_cmap('tab20').colors + ) + ax.legend() ax.set_title('Cluster sizes') ax.set_xlabel('Year') ax.set_ylabel('Number of publications') - ax.axhline(0, color="black", ls="--") - + ax.yaxis.set_ticklabels([]) + ax.xaxis.grid(color='gray') + temp = ax.xaxis.get_ticklabels() + temp = list(set(temp) - set(temp[::showNthGrid])) + for label in temp: + label.set_visible(False) + ax.set_axisbelow(True) + #plt.show() return fig diff --git a/tox.ini b/tox.ini index ea55a51..389f70c 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py37,py38 +envlist = py39 isolated_build = True [pytest] From 8a1208f68f7181a29f6cb97366dfb8c65696750f Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Fri, 17 Dec 2021 15:08:44 +0100 Subject: [PATCH 15/53] upd origin, wip on reports multiprocessing --- src/semanticlayertools/clustering/leiden.py | 9 ++-- src/semanticlayertools/clustering/reports.py | 50 +++++++++++++++----- src/semanticlayertools/visual/utils.py | 1 - 3 files changed, 43 insertions(+), 17 deletions(-) diff --git a/src/semanticlayertools/clustering/leiden.py b/src/semanticlayertools/clustering/leiden.py index 31b43ea..f0045ad 100644 --- a/src/semanticlayertools/clustering/leiden.py +++ b/src/semanticlayertools/clustering/leiden.py @@ -18,6 +18,7 @@ def __init__( self, inpath: str, outpath: str, resolution: float = 0.003, intersliceCoupling: float = 0.4, timerange: tuple = (1945, 2005), + debug: debugVar = False ): starttime = time.time() @@ -33,7 +34,7 @@ def __init__( f'timeclusters_{timerange[0]}-{timerange[1]}_res_{resolution}_intersl_{intersliceCoupling}.csv' ) if os.path.isfile(self.outfile): - raise OSError('Output file exists. Please remove.') + raise OSError(f'Output file at {self.outfile} exists. Aborting.') edgefiles = [x for x in os.listdir(inpath) if x.endswith('_GC.net')] @@ -57,7 +58,7 @@ def __init__( f"loaded in {time.time() - starttime} seconds." 
) - def optimize(self): + def optimize(self, clusterSizeCompare: int=1000): """Optimize clusters accross time slices.""" starttime = time.time() @@ -109,10 +110,10 @@ def optimize(self): outfile.write( f"{elem[0]},{elem[1]},{elem[2]}\n" ) - + largeclu = [(x,len(x.vs)) for x in subgraphs if len(x.vs)>clusterSizeCompare] print( f'Finished in {time.time() - starttime} seconds.' - f"Found {len(subgraphs)} clusters." + f"Found {len(subgraphs)} clusters, with {len(largeclu)} larger then {clusterSizeCompare} nodes." ) return commun diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index 5110510..40b7b35 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -6,6 +6,7 @@ import textacy import textacy.tm import pandas as pd +import numpy as np import multiprocessing num_processes = multiprocessing.cpu_count() @@ -17,22 +18,28 @@ class ClusterReports(): def __init__( - self, infile:str, metadatapath:str, outpath:str, iteration: int, + self, infile:str, metadatapath:str, outpath:str, numberProc: int=num_processes, minClusterSize: int=1000 ): - self.iteration = iteration self.numberProc = numberProc self.minClusterSize = minClusterSize self.metadatapath = metadatapath - self.outpath = outpath - self.clusterdf = pd.read_csv(infile) basedata = self.clusterdf.groupby(['year', 'cluster']).size().to_frame('counts').reset_index() - largeClusterList = list(basedata.groupby('cluster').sum().query(f'counts > {self.minClusterSize}').index) - + largeClusterList = list( + basedata.groupby('cluster').sum().query(f'counts > {self.minClusterSize}').index + ) self.clusternodes = self.clusterdf.query( 'cluster in @largeClusterList' ) + outfolder = infile.split(os.path.sep)[-1].split('.')[0] + self.outpath = os.path.join(outpath, outfolder) + if os.path.isdir(self.outpath): + raise OSError(f'Output folder {self.outpath} exists. 
Aborting.') + else: + os.mkdir(self.outpath) + for clu in largeClusterList: + os.mkdir(os.path.join(self.outpath, f'Cluster_{clu}')) def create_corpus(self, dataframe): """Create corpus out of dataframe.""" @@ -161,9 +168,28 @@ def fullReport(self, cluster): ) return cluster - def processClusters(self, publicationIDcolumn: str='nodeID'): - for filename in tqdm(os.listdir(self.metadatapath)): - filepath = os.path.join(self.metadatapath, filename) - data = pd.read_json(filepath, lines=True) - selectMerge = data.merge(self.clusternodes, left_on=publicationIDcolumn, right_on='node', how='inner') - selectMerge.to_json(os.path.join(self.outpath, 'merge_' + filename) , orient='records', lines=True) + def _mergeData(self, filename, publicationIDcolumn: str='nodeID'): + filepath = os.path.join(self.metadatapath, filename) + data = pd.read_json(filepath, lines=True) + selectMerge = data.merge(self.clusternodes, left_on=publicationIDcolumn, right_on='node', how='inner') + if selectMerge.shape[0]>0: + for clu, g0 in selectMerge.groupby('cluster'): + g0.to_json(os.path.join(self.outpath, f'Cluster_{clu}', 'merged_' + filename) , orient='records', lines=True) + self.pbar.update(1) + return + + def gatherClusterMetadata(self): + filenames = os.listdir(self.metadatapath) + #chunk_size = int(len(filenames) / self.numberProc) + #chunks = np.array_split(filenames, chunk_size) + self.pbar = tqdm(len(filenames)) + pool = multiprocessing.Pool(self.numberProc) + result = pool.map(self._mergeData, filenames, chunksize=int(len(filenames) / self.numberProc)) + return + + # filepath = os.path.join(self.metadatapath, filename) + # data = pd.read_json(filepath, lines=True) + # selectMerge = data.merge(self.clusternodes, left_on=publicationIDcolumn, right_on='node', how='inner') + # if selectMerge.shape[0]>0: + # for clu, g0 in selectMerge.groupby('cluster'): + # g0.to_json(os.path.join(self.outpath, f'Cluster_{clu}', 'merged_' + filename) , orient='records', lines=True) diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index 0f79bdf..b7fc4aa 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -72,5 +72,4 @@ def streamgraph(filepath: str, smooth: smoothing=False, minClusterSize: int=1000 for label in temp: label.set_visible(False) ax.set_axisbelow(True) - #plt.show() return fig From 3f6769f01b8ea06283154a50ec02af9420ade5f7 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 20 Dec 2021 16:24:38 +0100 Subject: [PATCH 16/53] finish mp reporting, add pipeline --- src/semanticlayertools/clustering/leiden.py | 3 +- src/semanticlayertools/clustering/reports.py | 211 +++++++++--------- src/semanticlayertools/linkage/cocitation.py | 12 +- .../pipelines/cocitetimeclusters.py | 50 +++++ 4 files changed, 163 insertions(+), 113 deletions(-) create mode 100644 src/semanticlayertools/pipelines/cocitetimeclusters.py diff --git a/src/semanticlayertools/clustering/leiden.py b/src/semanticlayertools/clustering/leiden.py index f0045ad..4721783 100644 --- a/src/semanticlayertools/clustering/leiden.py +++ b/src/semanticlayertools/clustering/leiden.py @@ -18,7 +18,6 @@ def __init__( self, inpath: str, outpath: str, resolution: float = 0.003, intersliceCoupling: float = 0.4, timerange: tuple = (1945, 2005), - debug: debugVar = False ): starttime = time.time() @@ -116,4 +115,4 @@ def optimize(self, clusterSizeCompare: int=1000): f"Found {len(subgraphs)} clusters, with {len(largeclu)} larger then {clusterSizeCompare} nodes." 
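        # Illustrative sketch, not part of the committed diff: self.outfile written above is
        # a plain CSV with one `node,year,cluster` row per node occurrence and time slice, so
        # the size distribution reported here can be rechecked later with pandas, e.g.
        #     import pandas as pd
        #     clusters = pd.read_csv('timeclusters_1945-2005_res_0.003_intersl_0.4.csv')
        #     print(clusters.groupby('cluster').size().sort_values(ascending=False).head())
        # (the file name follows the pattern set in __init__ and is only an example here).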
) - return commun + return self.outfile, commun diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index 40b7b35..cb3cca3 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -1,5 +1,8 @@ import re import os +import time +import multiprocessing +from collections import Counter from tqdm import tqdm import spacy @@ -7,7 +10,7 @@ import textacy.tm import pandas as pd import numpy as np -import multiprocessing +import warnings num_processes = multiprocessing.cpu_count() @@ -18,34 +21,36 @@ class ClusterReports(): def __init__( - self, infile:str, metadatapath:str, outpath:str, - numberProc: int=num_processes, minClusterSize: int=1000 + self, infile: str, metadatapath: str, outpath: str, + numberProc: int = num_processes, minClusterSize: int = 1000, + timerange: tuple = (1945, 2005) ): self.numberProc = numberProc self.minClusterSize = minClusterSize self.metadatapath = metadatapath - self.clusterdf = pd.read_csv(infile) - basedata = self.clusterdf.groupby(['year', 'cluster']).size().to_frame('counts').reset_index() - largeClusterList = list( + clusterdf = pd.read_csv(infile) + basedata = clusterdf.groupby(['year', 'cluster']).size().to_frame('counts').reset_index() + self.largeClusterList = list( basedata.groupby('cluster').sum().query(f'counts > {self.minClusterSize}').index ) - self.clusternodes = self.clusterdf.query( - 'cluster in @largeClusterList' + self.clusternodes = clusterdf.query( + 'cluster in @self.largeClusterList' ) - outfolder = infile.split(os.path.sep)[-1].split('.')[0] + outfolder = infile.split(os.path.sep)[-1][:-4] + self.timerange = timerange self.outpath = os.path.join(outpath, outfolder) if os.path.isdir(self.outpath): raise OSError(f'Output folder {self.outpath} exists. 
Aborting.') else: os.mkdir(self.outpath) - for clu in largeClusterList: + for clu in self.largeClusterList: os.mkdir(os.path.join(self.outpath, f'Cluster_{clu}')) def create_corpus(self, dataframe): """Create corpus out of dataframe.""" docs = [] titles = [x[0] for x in dataframe.title.values if type(x) == list] - for title in tqdm(titles): + for title in tqdm(titles, leave=False): try: # text pre-processing title = re.sub("\n", " ", title) @@ -66,8 +71,9 @@ def create_corpus(self, dataframe): corpus_titles = textacy.Corpus(mainLanguageCorp, data=docs) return corpus_titles - - def find_topics(self, corpus_titles: list, n_topics: int, top_words: int, outpath:str='./', writeReport: bool=False): + def find_topics( + self, corpus_titles: list, n_topics: int, top_words: int, + ): """Calculate topics in corpus.""" vectorizer = textacy.representations.vectorizers.Vectorizer( tf_type="linear", @@ -86,110 +92,103 @@ def find_topics(self, corpus_titles: list, n_topics: int, top_words: int, outpat model = textacy.tm.TopicModel("nmf", n_topics) model.fit(doc_term_matrix) - doc_topic_matrix = model.transform(doc_term_matrix) - topics = [] for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_words): topics.append("topic " + str(topic_idx) + ": " + " ".join(top_terms)) - if writeReport is True: - outfile.write(f'\n\n\tTopics in cluster for {n_topics} topics:\n') - for topic in topics: - outfile.write(f'\t\t{topic}\n') - else: - print("\nTopics in the cluster:\n") - return topics + outtext = f'\n\n\tTopics in cluster for {n_topics} topics:\n' + for topic in topics: + outtext += f'\t\t{topic}\n' + return outtext def fullReport(self, cluster): """Generate full cluster report.""" - with open(f'{outFolderReports}Report_{cluster}.txt', 'a') as outfile: - selection = self.clusterdf.query('cluster == @cluster') - nodeList = list(set(selection.node.values)) - starttime = time.time() - result = {} - resultMeta = [] - for key, vals in groupby(sorted(nodeList), lambda x: x[:4]): - result[int(key)] = list(vals) - - for key in result.keys(): - if key > 1949 and key < 2005 and key != 1996: - yeardata = pd.read_json(f'{inFolderMetadata}{key}_meta.json', lines=True) - selectNodedata = yeardata[yeardata.nodeID.isin(result[key])] - resultMeta.append(selectNodedata) - - metadata = pd.concat(resultMeta) - metadata.to_json( - f'{outFolderReports}meta/cluster_{cluster}_meta.json', - orient='records', - lines=True - ) - foundNodes = [x[0] for x in metadata.bibcode.values] - notFound = [x for x in nodeList if x not in foundNodes] - - outfile.write( - f'\tGot {len(nodeList)} unique publications in time range:\ - {selection.year.min()} to {selection.year.max()}.\n' - ) - outfile.write( - f'\t\tFound metadata for {metadata.shape[0]} publications.\n' - ) - outfile.write( - f'\t\tThere are {len([x for x in foundNodes if x not in nodeList])}\ - found publications which where NOT in the query list.\n' - ) - outfile.write( - f'\t\tThere are {len(notFound)} publication(s) which where NOT found:\n' - ) - - topAuthors = Counter( - [x for y in [x for x in metadata.author.values if type(x) == list] for x in y] - ).most_common(20) - outfile.write('\n\tThe top authors of this cluster are:\n') - for elem in topAuthors: - outfile.write(f'\t\t{elem[0]}: {elem[1]} pubs\n.') - topAffils = Counter( - [x for y in [x for x in metadata.aff.values if type(x) == list] for x in y] - ).most_common(20) - outfile.write('\n\tThe top 20 affiliations of this cluster are:\n') - for elem in topAffils: - outfile.write(f'\t\t{elem[0]}: 
{elem[1]} authors.\n') - outfile.write( - f'\n\n\tFinished analysis of cluster {cluster} with {len(nodeList)}\ - unique publications in {time.time()- starttime} seconds.\n\n' - ) - corpus = create_corpus(metadata) - find_topics( - corpus, n_topics=15, top_words=10, writeReport=True, outfile=outfile - ) - find_topics( - corpus, n_topics=50, top_words=10, writeReport=True, outfile=outfile - ) - outfile.write( - f'\n\n\tFinished analysis of topics in {cluster} in {time.time()- starttime} seconds.\n\n' - ) - return cluster - - def _mergeData(self, filename, publicationIDcolumn: str='nodeID'): + starttime = time.time() + clusterpath = os.path.join(self.outpath, f'Cluster_{cluster}') + clusterfiles = os.listdir(clusterpath) + clusterdf = [] + for x in clusterfiles: + try: + clusterdf.append( + pd.read_json(os.path.join(clusterpath, x), lines=True) + ) + except ValueError: + raise + dfCluster = pd.concat(clusterdf, ignore_index=True) + basedf = self.clusternodes.query('cluster == @cluster') + inputnodes = basedf.node.values + foundNodes = [x[0] for x in dfCluster.bibcode.values] + notFound = [x for x in inputnodes if x not in foundNodes] + topAuthors = Counter( + [x for y in [x for x in dfCluster.author.values if type(x) == list] for x in y] + ).most_common(20) + authortext = '' + for x in topAuthors: + authortext += f'\t{x[0]}: {x[1]}\n' + topAffils = Counter( + [x for y in [x for x in dfCluster.aff.values if type(x) == list] for x in y] + ).most_common(21) + affiltext = '' + for x in topAffils[1:]: + affiltext += f'\t{x[0]}: {x[1]}\n' + corpus = self.create_corpus(dfCluster) + warnings.simplefilter(action='ignore', category=FutureWarning) + topics_15 = self.find_topics(corpus, n_topics=15, top_words=20) + topics_50 = self.find_topics(corpus, n_topics=50, top_words=20) + outtext = f"""Report for Cluster {cluster} + +Got {len(inputnodes)} unique publications in time range: {basedf.year.min()} to {basedf.year.max()}. + Found metadata for {dfCluster.shape[0]} publications. + There are {len(notFound)} publications without metadata. 
+ + The top 20 authors of this cluster are: + {authortext} + + The top 20 affiliations of this cluster are: + {affiltext} + + {topics_15} + + {topics_50} + +Finished analysis of cluster {cluster} in {time.time()- starttime} seconds.""" + return outtext + + def _mergeData(self, filename, publicationIDcolumn: str = 'nodeID'): filepath = os.path.join(self.metadatapath, filename) data = pd.read_json(filepath, lines=True) - selectMerge = data.merge(self.clusternodes, left_on=publicationIDcolumn, right_on='node', how='inner') - if selectMerge.shape[0]>0: + selectMerge = data.merge( + self.clusternodes, + left_on=publicationIDcolumn, + right_on='node', + how='inner' + ) + if selectMerge.shape[0] > 0: for clu, g0 in selectMerge.groupby('cluster'): - g0.to_json(os.path.join(self.outpath, f'Cluster_{clu}', 'merged_' + filename) , orient='records', lines=True) - self.pbar.update(1) - return + g0.to_json( + os.path.join( + self.outpath, + f'Cluster_{clu}', + 'merged_' + filename + ), orient='records', lines=True + ) + return '' def gatherClusterMetadata(self): filenames = os.listdir(self.metadatapath) - #chunk_size = int(len(filenames) / self.numberProc) - #chunks = np.array_split(filenames, chunk_size) - self.pbar = tqdm(len(filenames)) - pool = multiprocessing.Pool(self.numberProc) - result = pool.map(self._mergeData, filenames, chunksize=int(len(filenames) / self.numberProc)) + yearFiles = [] + for x in filenames: + try: + year = int(re.findall(r'\d{4}', x)[0]) + except: + raise + if self.timerange[0] <= year <= self.timerange[1]: + yearFiles.append(x) + with multiprocessing.Pool(self.numberProc) as pool: + _ = pool.map(self._mergeData, tqdm(yearFiles, leave=False)) return - # filepath = os.path.join(self.metadatapath, filename) - # data = pd.read_json(filepath, lines=True) - # selectMerge = data.merge(self.clusternodes, left_on=publicationIDcolumn, right_on='node', how='inner') - # if selectMerge.shape[0]>0: - # for clu, g0 in selectMerge.groupby('cluster'): - # g0.to_json(os.path.join(self.outpath, f'Cluster_{clu}', 'merged_' + filename) , orient='records', lines=True) + def writeReports(self): + for cluster in tqdm(self.largeClusterList): + outtext = self.fullReport(cluster) + with open(f'{self.outpath}Cluster_{cluster}.txt', 'w') as file: + file.write(outtext) diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index a78a745..143b0df 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -16,12 +16,14 @@ limitRefLength = TypeVar('limitRefLength', bool, int) debugVar = TypeVar('debugVar', bool, str) + class Cocitations(): """Create cocitation networks.""" def __init__( self, inpath, outpath, columnName, - numberProc: int=num_processes, limitRefLength: limitRefLength=False, debug: debugVar=False, + numberProc: int = num_processes, limitRefLength: limitRefLength = False, + debug: debugVar = False, ): self.inpath = inpath self.outpath = outpath @@ -35,7 +37,7 @@ def getCombinations(self, chunk): res = [] if type(self.limitRefLength) == int: reflen = chunk[self.columnName].apply( - lambda x: True if type(x)==list and len(x)<=self.limitRefLength else False + lambda x: True if type(x) == list and len(x) <= self.limitRefLength else False ) data = chunk[reflen].copy() else: @@ -65,7 +67,7 @@ def calculateCoCitation(self, filepath): sortedComponents = sorted( [(x, len(x), len(x)*100/len(tempG.vs)) for x in components], key=lambda x: x[1], reverse=True ) - with 
open(os.path.join(self.outpath,infilename + '_graphMetadata.txt'), 'w') as outfile: + with open(os.path.join(self.outpath, infilename + '_graphMetadata.txt'), 'w') as outfile: outfile.write(f'Graph derived from {filepath}\nSummary:\n') outfile.write(tempG.summary() + '\n\nComponents (ordered by size):\n\n') for idx, elem in enumerate(sortedComponents): @@ -76,9 +78,9 @@ def calculateCoCitation(self, filepath): giantComponent = sortedComponents[0] giantComponentGraph = tempG.vs.select(giantComponent[0]).subgraph() giantComponentGraph.write_pajek( - os.path.join(self.outpath,infilename + '_GC.net') + os.path.join(self.outpath, infilename + '_GC.net') ) - with open(os.path.join(self.outpath,infilename + '.ncol'), 'w') as outfile: + with open(os.path.join(self.outpath, infilename + '.ncol'), 'w') as outfile: for edge in sortCoCitCounts: outfile.write(f"{edge[0]} {edge[1]} {edge[2]}\n") except: diff --git a/src/semanticlayertools/pipelines/cocitetimeclusters.py b/src/semanticlayertools/pipelines/cocitetimeclusters.py new file mode 100644 index 0000000..8ce6027 --- /dev/null +++ b/src/semanticlayertools/pipelines/cocitetimeclusters.py @@ -0,0 +1,50 @@ +"""Runs all steps to create reports for cocite temporal network clustering.""" +import tempfile +from datetime import datetime +import os +import multiprocessing + +from ..linkage.cocitation import Cocitations +from ..clustering.leiden import TimeCluster +from ..clustering.reports import ClusterReports + +num_processes = multiprocessing.cpu_count() + + +def run( + basepath, + cociteOutpath, + timeclusterOutpath, + reportsOutpath, + resolution, + intersliceCoupling, + minClusterSize: int = 1000, + timerange=(1945, 2005), + referenceColumnName: str = 'reference', + numberproc: int = num_processes, + limitRefLength=False, debug=False +): + cocites = Cocitations( + basepath, cociteOutpath, referenceColumnName, limitRefLength, debug + ) + cocites.processFolder() + timeclusters = TimeCluster( + inpath=cociteOutpath, + outpath=timeclusterOutpath, + resolution=resolution, + intersliceCoupling=intersliceCoupling, + timerange=timerange, + debug=debug + ) + timeclfile, _ = timeclusters.optimize() + clusterreports = ClusterReports( + infile=timeclfile, + metadatapath=basepath, + outpath=reportsOutpath, + numberProc=numberproc, + minClusterSize=minClusterSize, + timerange=(timerange[0], timerange[1] + 3) + ) + clusterreports.gatherClusterMetadata() + clusterreports.writeReports() + print('Done') From c4b60dfd2406871d76ea5a9ee7a51679c635309d Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 21 Dec 2021 15:00:32 +0100 Subject: [PATCH 17/53] finish pipeline, minor improv of reportings --- src/semanticlayertools/clustering/leiden.py | 8 ++-- src/semanticlayertools/clustering/reports.py | 17 +++++--- src/semanticlayertools/linkage/cocitation.py | 43 +++++++++++++++---- .../pipelines/cocitetimeclusters.py | 32 ++++++++------ 4 files changed, 69 insertions(+), 31 deletions(-) diff --git a/src/semanticlayertools/clustering/leiden.py b/src/semanticlayertools/clustering/leiden.py index 4721783..c695d21 100644 --- a/src/semanticlayertools/clustering/leiden.py +++ b/src/semanticlayertools/clustering/leiden.py @@ -39,7 +39,7 @@ def __init__( self.graphDict = {} - for idx in tqdm(range(len(edgefiles))): + for idx in tqdm(range(len(edgefiles)), leave=False): try: year = re.findall(r'\d{4}', edgefiles[idx])[0] except: @@ -66,7 +66,7 @@ def optimize(self, clusterSizeCompare: int=1000): interslice_weight=self.interslice_param, vertex_id_attr='name' ) - print('\tHave set 
layers.') + print('\tSet layers.') partitions = [ la.CPMVertexPartition( @@ -76,7 +76,7 @@ def optimize(self, clusterSizeCompare: int=1000): resolution_parameter=self.res_param ) for H in layers ] - print('\tHave set partitions.') + print('\tSet partitions.') interslice_partition = la.CPMVertexPartition( interslice_layer, @@ -84,7 +84,7 @@ def optimize(self, clusterSizeCompare: int=1000): node_sizes='node_size', weights='weight' ) - print('\tHave set interslice partions.') + print('\tSet interslice partions.') self.optimiser.optimise_partition_multiplex( partitions + [interslice_partition] diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index cb3cca3..a4b27c6 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -100,7 +100,9 @@ def find_topics( outtext += f'\t\t{topic}\n' return outtext - def fullReport(self, cluster): + def fullReport(self, cluster, authorColumnName: str = 'author', + affiliationColumnName: str = 'aff' + ): """Generate full cluster report.""" starttime = time.time() clusterpath = os.path.join(self.outpath, f'Cluster_{cluster}') @@ -115,21 +117,21 @@ def fullReport(self, cluster): raise dfCluster = pd.concat(clusterdf, ignore_index=True) basedf = self.clusternodes.query('cluster == @cluster') - inputnodes = basedf.node.values - foundNodes = [x[0] for x in dfCluster.bibcode.values] - notFound = [x for x in inputnodes if x not in foundNodes] + inputnodes = set(basedf.node.values) + notFound = inputnodes.difference(set(dfCluster.nodeID.values)) topAuthors = Counter( - [x for y in [x for x in dfCluster.author.values if type(x) == list] for x in y] + [x for y in dfCluster[authorColumnName].fillna('').values for x in y] ).most_common(20) authortext = '' for x in topAuthors: authortext += f'\t{x[0]}: {x[1]}\n' topAffils = Counter( - [x for y in [x for x in dfCluster.aff.values if type(x) == list] for x in y] + [x for y in dfCluster[affiliationColumnName].fillna('').values for x in y] ).most_common(21) affiltext = '' for x in topAffils[1:]: affiltext += f'\t{x[0]}: {x[1]}\n' + print(f'\tFinished base report for cluster {cluster}.') corpus = self.create_corpus(dfCluster) warnings.simplefilter(action='ignore', category=FutureWarning) topics_15 = self.find_topics(corpus, n_topics=15, top_words=20) @@ -151,6 +153,7 @@ def fullReport(self, cluster): {topics_50} Finished analysis of cluster {cluster} in {time.time()- starttime} seconds.""" + print('\t\tFinished topics.') return outtext def _mergeData(self, filename, publicationIDcolumn: str = 'nodeID'): @@ -188,7 +191,7 @@ def gatherClusterMetadata(self): return def writeReports(self): - for cluster in tqdm(self.largeClusterList): + for cluster in tqdm(self.largeClusterList, leave=False): outtext = self.fullReport(cluster) with open(f'{self.outpath}Cluster_{cluster}.txt', 'w') as file: file.write(outtext) diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index 143b0df..98840fc 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -1,6 +1,7 @@ """Link documents by cocitation.""" import os import time +import re import multiprocessing from itertools import combinations from collections import Counter @@ -22,14 +23,17 @@ class Cocitations(): def __init__( self, inpath, outpath, columnName, - numberProc: int = num_processes, limitRefLength: limitRefLength = False, - debug: debugVar = False, + numberProc: int = 
num_processes, + limitRefLength: limitRefLength = False, + timerange: tuple = (1945, 2005), + debug: debugVar = False ): self.inpath = inpath self.outpath = outpath self.columnName = columnName self.numberProc = numberProc self.limitRefLength = limitRefLength + self.timerange = timerange self.debug = debug def getCombinations(self, chunk): @@ -75,6 +79,13 @@ def calculateCoCitation(self, filepath): outfile.write( f"{idx}:\n\t{elem[1]} nodes ({elem[2]:.3f}% of full graph)\n\t{len(gcompTemp.es)} edges ({len(gcompTemp.es)*100/len(tempG.es):.3f}% of full graph)\n\n" ) + if idx == 0: + gcouttuple = ( + elem[1], + elem[2], + len(gcompTemp.es), + len(gcompTemp.es)*100/len(tempG.es) + ) giantComponent = sortedComponents[0] giantComponentGraph = tempG.vs.select(giantComponent[0]).subgraph() giantComponentGraph.write_pajek( @@ -87,15 +98,31 @@ def calculateCoCitation(self, filepath): raise if self.debug == "l2": print(f'\tDone in {time.time() - starttime} seconds.') - return + return gcouttuple def processFolder(self): """Calculate cocitation for all files in folder.""" starttime = time.time() - for file in tqdm(os.listdir(self.inpath)): - try: - self.calculateCoCitation(os.path.join(self.inpath, file)) - except: - raise + with open( + os.path.join( + self.outpath, 'Giant_Component_properties.csv' + ), 'w' + ) as gcmetafile: + gcmetafile.write('year,nodes,nodespercent,edges,edgepercent\n') + for file in tqdm(os.listdir(self.inpath), leave=False): + try: + year = re.findall(r'\d{4}', file)[0] + except: + raise + if self.timerange[0] <= int(year) <= self.timerange[1]: + try: + outtuple = self.calculateCoCitation( + os.path.join(self.inpath, file) + ) + gcmetafile.write( + f'{year},{outtuple[0]},{outtuple[1]},{outtuple[2]},{outtuple[3]}\n' + ) + except: + raise if self.debug is True: print(f'\tDone in {time.time() - starttime} seconds.') diff --git a/src/semanticlayertools/pipelines/cocitetimeclusters.py b/src/semanticlayertools/pipelines/cocitetimeclusters.py index 8ce6027..1762507 100644 --- a/src/semanticlayertools/pipelines/cocitetimeclusters.py +++ b/src/semanticlayertools/pipelines/cocitetimeclusters.py @@ -1,6 +1,5 @@ """Runs all steps to create reports for cocite temporal network clustering.""" -import tempfile -from datetime import datetime +import time import os import multiprocessing @@ -12,20 +11,29 @@ def run( - basepath, - cociteOutpath, - timeclusterOutpath, - reportsOutpath, - resolution, - intersliceCoupling, + inputFilepath: str, + cociteOutpath: str, + timeclusterOutpath: str, + reportsOutpath: str, + resolution: float, + intersliceCoupling: float, minClusterSize: int = 1000, - timerange=(1945, 2005), + timerange: tuple = (1945, 2005), referenceColumnName: str = 'reference', numberproc: int = num_processes, limitRefLength=False, debug=False ): + for path in [cociteOutpath, timeclusterOutpath, reportsOutpath]: + os.makedirs(path) + starttime = time.time() cocites = Cocitations( - basepath, cociteOutpath, referenceColumnName, limitRefLength, debug + inpath=inputFilepath, + outpath=cociteOutpath, + columnName=referenceColumnName, + numberProc=numberproc, + limitRefLength=limitRefLength, + timerange=timerange, + debug=debug ) cocites.processFolder() timeclusters = TimeCluster( @@ -39,7 +47,7 @@ def run( timeclfile, _ = timeclusters.optimize() clusterreports = ClusterReports( infile=timeclfile, - metadatapath=basepath, + metadatapath=inputFilepath, outpath=reportsOutpath, numberProc=numberproc, minClusterSize=minClusterSize, @@ -47,4 +55,4 @@ def run( ) 
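    # Illustrative sketch, not part of the committed diff: with the three stages wired up
    # above (cocitation networks, temporal Leiden clustering, cluster reports), a typical
    # invocation of this pipeline could look as follows. All paths and parameter values are
    # assumptions for illustration; only the keyword names come from the signature of run():
    #     from semanticlayertools.pipelines import cocitetimeclusters
    #     cocitetimeclusters.run(
    #         inputFilepath='./data/',                 # one JSON-lines metadata file per year
    #         cociteOutpath='./output/cocite/',
    #         timeclusterOutpath='./output/timeclusters/',
    #         reportsOutpath='./output/reports/',
    #         resolution=0.003,
    #         intersliceCoupling=0.4,
    #         timerange=(1945, 2005),
    #         referenceColumnName='reference',
    #     )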
clusterreports.gatherClusterMetadata() clusterreports.writeReports() - print('Done') + print(f'Done after {time.time() - starttime} seconds.') From e4e81d9a75fc7469f6ea616cc23a9732554451b4 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 21 Dec 2021 16:19:05 +0100 Subject: [PATCH 18/53] add embedding utility fct --- setup.cfg | 21 ++++++++++- src/semanticlayertools/visual/utils.py | 52 +++++++++++++++++++++++++- 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 9a67afc..6621ead 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,7 +19,7 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.7 +python_requires = >=3.8 install_requires = tqdm matplotlib @@ -33,5 +33,24 @@ install_requires = igraph leidenalg +[options.extras_require] +all = + %(embeddml)s + %(doc)s + %(dev)s + %(test)s +doc = + sphinx +dev = + twine + %(test)s +test = + tox +embeddml = + torch + umap-learn + sentence-transformers + plotly + [options.packages.find] where = src diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index b7fc4aa..4558e68 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -1,8 +1,18 @@ +import os +from typing import TypeVar + import matplotlib.pyplot as plt import pandas as pd import numpy as np from scipy import stats -from typing import TypeVar + +from collections import Counter +import plotly.express as px +import plotly.graph_objects as go + +from sentence_transformers import SentenceTransformer, util +import umap +import torch smoothing = TypeVar('smoothing', bool, float) @@ -73,3 +83,43 @@ def streamgraph(filepath: str, smooth: smoothing=False, minClusterSize: int=1000 label.set_visible(False) ax.set_axisbelow(True) return fig + + +def embeddedText(infolderpath: str, columnName: str, outpath: str): + """Create embedding for corpus text.""" + print('Initializing embedder model.') + model = SentenceTransformer('all-MiniLM-L6-v2') + clusterfiles = os.listdir(infolderpath) + clusterdf = [] + for x in clusterfiles: + try: + clusterdf.append( + pd.read_json(os.path.join(infolderpath, x), lines=True) + ) + except ValueError: + raise + dataframe = pd.concat(clusterdf, ignore_index=True) + corpus = [x[0] for x in dataframe[columnName].fillna('').values if x] + print('Start embedding.') + corpus_embeddings = model.encode( + corpus, + convert_to_tensor=True + ) + torch.save( + corpus_embeddings, + f'{os.path.join(outpath, "embeddedCorpus.pt")}' + ) + print('\tDone\nStarting mapping to 2D.') + corpus_embeddings_2D = umap.UMAP( + n_neighbors=15, + n_components=2, + metric='cosine' + ).fit_transform(corpus_embeddings) + corpus_embeddings_2D.tofile( + f'{os.path.join(outpath, "embeddedCorpus_2d.csv")}', + sep=',' + ) + print('\tDone.') + dataframe.insert(0, 'x', corpus_embeddings_2D[:, 0]) + dataframe.insert(0, 'y', corpus_embeddings_2D[:, 1]) + return dataframe From e1459bf3790384feca8876694190b92a3f17360e Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 22 Dec 2021 12:55:43 +0100 Subject: [PATCH 19/53] rm not necess imports --- src/semanticlayertools/visual/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index 4558e68..f2c8732 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -10,7 +10,7 @@ import plotly.express as px import plotly.graph_objects as go -from sentence_transformers import 
SentenceTransformer, util +from sentence_transformers import SentenceTransformer import umap import torch From 23434304af90578985b9093a9a9b064f57620ec6 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 3 Jan 2022 14:09:36 +0100 Subject: [PATCH 20/53] fix csv export of embeddings --- src/semanticlayertools/visual/utils.py | 40 ++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index f2c8732..63defe3 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -85,7 +85,7 @@ def streamgraph(filepath: str, smooth: smoothing=False, minClusterSize: int=1000 return fig -def embeddedText(infolderpath: str, columnName: str, outpath: str): +def embeddedTextPlotting(infolderpath: str, columnName: str, outpath: str): """Create embedding for corpus text.""" print('Initializing embedder model.') model = SentenceTransformer('all-MiniLM-L6-v2') @@ -99,7 +99,8 @@ def embeddedText(infolderpath: str, columnName: str, outpath: str): except ValueError: raise dataframe = pd.concat(clusterdf, ignore_index=True) - corpus = [x[0] for x in dataframe[columnName].fillna('').values if x] + dataframe = dataframe.dropna(subset=[columnName], axis=0) + corpus = [x[0] for x in dataframe[columnName].values if x] print('Start embedding.') corpus_embeddings = model.encode( corpus, @@ -115,10 +116,37 @@ def embeddedText(infolderpath: str, columnName: str, outpath: str): n_components=2, metric='cosine' ).fit_transform(corpus_embeddings) - corpus_embeddings_2D.tofile( - f'{os.path.join(outpath, "embeddedCorpus_2d.csv")}', - sep=',' - ) + np.savetxt(os.path.join(outpath, "embeddedCorpus_2d.csv"), corpus_embeddings_2D, delimiter=',', newline='\n') + print('\tDone.') + dataframe.insert(0, 'x', corpus_embeddings_2D[:, 0]) + dataframe.insert(0, 'y', corpus_embeddings_2D[:, 1]) + return dataframe + + +def embeddedTextClustering(infolderpath: str, columnName: str, emdeddingspath: str, outpath: str): + """Create clustering based on embedding for corpus texts.""" + print('Initializing embedder model.') + model = SentenceTransformer('all-MiniLM-L6-v2') + clusterfiles = os.listdir(infolderpath) + clusterdf = [] + for x in clusterfiles: + try: + clusterdf.append( + pd.read_json(os.path.join(infolderpath, x), lines=True) + ) + except ValueError: + raise + dataframe = pd.concat(clusterdf, ignore_index=True) + corpus = [x[0] for x in dataframe[columnName].fillna('').values if x] + print('Loading embedding.') + corpus_embeddings = torch.load(embeddingspath) + print('\tDone\nStarting mapping to lower dimensions.') + corpus_embeddings = umap.UMAP( + n_neighbors=15, + n_components=50, + metric='cosine' + ).fit_transform(corpus_embeddings) + np.savetxt(os.path.join(outpath, "embeddedCorpus_50d.csv"), corpus_embeddings_2D, delimiter=',', newline='\n') print('\tDone.') dataframe.insert(0, 'x', corpus_embeddings_2D[:, 0]) dataframe.insert(0, 'y', corpus_embeddings_2D[:, 1]) From 9ed19a89cdadd99fe5d3a24452ca32c09cae8655 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 3 Jan 2022 19:17:19 +0100 Subject: [PATCH 21/53] add util for clustering --- setup.cfg | 1 + src/semanticlayertools/visual/utils.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/setup.cfg b/setup.cfg index 6621ead..a328a5e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,6 +49,7 @@ test = embeddml = torch umap-learn + hdbscan sentence-transformers plotly diff --git 
a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index 63defe3..725fc30 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -12,6 +12,7 @@ from sentence_transformers import SentenceTransformer import umap +import hdbscan import torch smoothing = TypeVar('smoothing', bool, float) @@ -126,7 +127,6 @@ def embeddedTextPlotting(infolderpath: str, columnName: str, outpath: str): def embeddedTextClustering(infolderpath: str, columnName: str, emdeddingspath: str, outpath: str): """Create clustering based on embedding for corpus texts.""" print('Initializing embedder model.') - model = SentenceTransformer('all-MiniLM-L6-v2') clusterfiles = os.listdir(infolderpath) clusterdf = [] for x in clusterfiles: @@ -137,17 +137,22 @@ def embeddedTextClustering(infolderpath: str, columnName: str, emdeddingspath: s except ValueError: raise dataframe = pd.concat(clusterdf, ignore_index=True) - corpus = [x[0] for x in dataframe[columnName].fillna('').values if x] + dataframe = dataframe.dropna(subset=[columnName], axis=0) + corpus = [x[0] for x in dataframe[columnName].values if x] print('Loading embedding.') corpus_embeddings = torch.load(embeddingspath) print('\tDone\nStarting mapping to lower dimensions.') - corpus_embeddings = umap.UMAP( + corpus_embeddings_50D = umap.UMAP( n_neighbors=15, n_components=50, metric='cosine' ).fit_transform(corpus_embeddings) - np.savetxt(os.path.join(outpath, "embeddedCorpus_50d.csv"), corpus_embeddings_2D, delimiter=',', newline='\n') - print('\tDone.') - dataframe.insert(0, 'x', corpus_embeddings_2D[:, 0]) - dataframe.insert(0, 'y', corpus_embeddings_2D[:, 1]) + np.savetxt(os.path.join(outpath, "embeddedCorpus_50d.csv"), corpus_embeddings_50D, delimiter=',', newline='\n') + print('\tDone.\nStarting clustering.') + cluster = hdbscan.HDBSCAN( + min_cluster_size=20, + metric='euclidean', + cluster_selection_method='eom' + ).fit(corpus_embeddings_50D) + dataframe.insert(0, 'label', cluster.labels_) return dataframe From f8b297a4356a976d89c8fee04557d556cb9913a7 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 4 Jan 2022 13:55:46 +0100 Subject: [PATCH 22/53] improve docs --- docs/clustering.rst | 15 ++++++ docs/index.rst | 6 ++- docs/linkage.rst | 5 ++ docs/pipelines.rst | 11 +++++ docs/visual.rst | 6 +++ src/semanticlayertools/linkage/cocitation.py | 48 ++++++++++++++++++-- src/semanticlayertools/linkage/wordscore.py | 8 ++-- src/semanticlayertools/visual/utils.py | 21 +++++++-- tox.ini | 10 +++- 9 files changed, 114 insertions(+), 16 deletions(-) create mode 100644 docs/clustering.rst create mode 100644 docs/pipelines.rst create mode 100644 docs/visual.rst diff --git a/docs/clustering.rst b/docs/clustering.rst new file mode 100644 index 0000000..62d1dc8 --- /dev/null +++ b/docs/clustering.rst @@ -0,0 +1,15 @@ +Clustering network data +======================= + +.. automodule:: semanticlayertools.clustering.infomap + :members: + :undoc-members: + + +.. automodule:: semanticlayertools.clustering.leiden + :members: + :undoc-members: + +.. automodule:: semanticlayertools.clustering.reports + :members: + :undoc-members: diff --git a/docs/index.rst b/docs/index.rst index 747681b..a445cda 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,8 +12,12 @@ This project collects tools to build semantic layers from text corpora. 
:maxdepth: 2 :caption: Contents: - linkage cleaning + pipelines + linkage + clustering + visual + diff --git a/docs/linkage.rst b/docs/linkage.rst index 441a5d1..3e65b98 100644 --- a/docs/linkage.rst +++ b/docs/linkage.rst @@ -4,3 +4,8 @@ Word scoring and linkage .. automodule:: semanticlayertools.linkage.wordscore :members: :undoc-members: + + +.. automodule:: semanticlayertools.linkage.cocitation + :members: + :undoc-members: diff --git a/docs/pipelines.rst b/docs/pipelines.rst new file mode 100644 index 0000000..0c2ff90 --- /dev/null +++ b/docs/pipelines.rst @@ -0,0 +1,11 @@ +Pipelines for workflows +======================= + +.. automodule:: semanticlayertools.pipelines.cocitetimeclusters + :members: + :undoc-members: + + +.. automodule:: semanticlayertools.pipelines.wordscorenet + :members: + :undoc-members: diff --git a/docs/visual.rst b/docs/visual.rst new file mode 100644 index 0000000..29e6a31 --- /dev/null +++ b/docs/visual.rst @@ -0,0 +1,6 @@ +Utility functions for visualizations +==================================== + +.. automodule:: semanticlayertools.visual.utils + :members: + :undoc-members: diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index 98840fc..2872b33 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -19,7 +19,33 @@ class Cocitations(): - """Create cocitation networks.""" + """Create cocitation networks. + + Calculates all combinations of all references of publications in given + corpus file(s). Can be limited for maximal number of references to consider + (e.g. papers with less then 200 references), to speed up creation of + networks. + + For each corpus file, graphs are generated by the weighted cocitation tuples, + using the Igraph package. Information on obtained clusters are written to + '_graphMetadata.txt' files. The subgraph of the Giant component is saved in + Pajek format with the ending '_GC.net'. The full edge data is written in + edge-Format to a '.ncol' file. + + :param inpath: Path for input data + :type inpath: str + :param outpath: Path for writing output data + :type outpath: str + :param columnName: Column name containing the references of a publication + :type columnName: str + :param numberProc: Number of CPUs the package is allowed to use (default=all) + :type numberProc: int + :param limitRefLength: Either False or integer giving the maximum number of references a considered publication is allowed to contain + :type limitRefLength: bool or int + :param timerange: Time range to consider (default=(1945,2005)) + :type timerange: tuple + :param debug: False/True or l2 to show level 2 debugging messages + """ def __init__( self, inpath, outpath, columnName, @@ -37,7 +63,13 @@ def __init__( self.debug = debug def getCombinations(self, chunk): - """Calculate combinations.""" + """Calculate combinations of references in publications chunk. + + :param chunk: A chunk of the corpus dataframe + :type chunk: `pd.Dataframe` + :returns: A list of all reference combinations for each corpus entry + :rtype: list + """ res = [] if type(self.limitRefLength) == int: reflen = chunk[self.columnName].apply( @@ -53,7 +85,17 @@ def getCombinations(self, chunk): return res def calculateCoCitation(self, filepath): - """Do calculation for input file.""" + """Run calculation for single input file. 
+ + Creates three files: Metadata-File with all components information, + Giant component network data in pajek format and full graph data in + edgelist format. + + :param filepath: Path for input corous + :type filepath: str + :returns: A tuple of GC information: Number of nodes and percentage of total, Number of edges and percentage of total + :rtype: tuple + """ infilename = filepath.split(os.path.sep)[-1].split('.')[0] starttime = time.time() try: diff --git a/src/semanticlayertools/linkage/wordscore.py b/src/semanticlayertools/linkage/wordscore.py index 84bdaa7..e36721b 100644 --- a/src/semanticlayertools/linkage/wordscore.py +++ b/src/semanticlayertools/linkage/wordscore.py @@ -1,6 +1,5 @@ import os -import re -from collections import Counter, defaultdict +from collections import Counter from itertools import islice, combinations from multiprocessing import Pool, cpu_count from tqdm import tqdm @@ -61,7 +60,7 @@ def __init__( self.counts = {} self.corpussize = 1 self.uniqueNGrams = () - self.debug=debug + self.debug = debug def getTermPatterns(self): """Create dictionaries of occuring ngrams.""" @@ -151,8 +150,7 @@ class LinksOverTime(): This class takes care of this, by adding new keys of authors, papers or ngrams to the register. - :param dataframe: Source dataframe containing metadata of texts - (authors, publicationID and year) + :param dataframe: Source dataframe containing metadata of texts (authors, publicationID and year) :type dataframe: class:`pandas.DataFrame` :param authorColumn: Column name for author information :param pubIDColumn: Column name to identify publications diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index 725fc30..c49aef8 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -117,14 +117,21 @@ def embeddedTextPlotting(infolderpath: str, columnName: str, outpath: str): n_components=2, metric='cosine' ).fit_transform(corpus_embeddings) - np.savetxt(os.path.join(outpath, "embeddedCorpus_2d.csv"), corpus_embeddings_2D, delimiter=',', newline='\n') + np.savetxt( + os.path.join(outpath, "embeddedCorpus_2d.csv"), + corpus_embeddings_2D, + delimiter=',', + newline='\n' + ) print('\tDone.') dataframe.insert(0, 'x', corpus_embeddings_2D[:, 0]) dataframe.insert(0, 'y', corpus_embeddings_2D[:, 1]) return dataframe -def embeddedTextClustering(infolderpath: str, columnName: str, emdeddingspath: str, outpath: str): +def embeddedTextClustering( + infolderpath: str, columnName: str, emdeddingspath: str, outpath: str +): """Create clustering based on embedding for corpus texts.""" print('Initializing embedder model.') clusterfiles = os.listdir(infolderpath) @@ -138,16 +145,20 @@ def embeddedTextClustering(infolderpath: str, columnName: str, emdeddingspath: s raise dataframe = pd.concat(clusterdf, ignore_index=True) dataframe = dataframe.dropna(subset=[columnName], axis=0) - corpus = [x[0] for x in dataframe[columnName].values if x] print('Loading embedding.') - corpus_embeddings = torch.load(embeddingspath) + corpus_embeddings = torch.load(emdeddingspath) print('\tDone\nStarting mapping to lower dimensions.') corpus_embeddings_50D = umap.UMAP( n_neighbors=15, n_components=50, metric='cosine' ).fit_transform(corpus_embeddings) - np.savetxt(os.path.join(outpath, "embeddedCorpus_50d.csv"), corpus_embeddings_50D, delimiter=',', newline='\n') + np.savetxt( + os.path.join(outpath, "embeddedCorpus_50d.csv"), + corpus_embeddings_50D, + delimiter=',', + newline='\n' + ) 
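# A minimal usage sketch for the two embedding helpers in this module. The
# function and keyword names come from the signatures above; the folder paths
# and the column name 'abstract' are illustrative assumptions.
from semanticlayertools.visual.utils import (
    embeddedTextPlotting, embeddedTextClustering
)

# Embeds the corpus texts, saves 'embeddedCorpus.pt' plus a 2D projection,
# and returns the metadata dataframe with x/y coordinates for plotting.
plotdf = embeddedTextPlotting(
    infolderpath='./clusterfiles/',
    columnName='abstract',
    outpath='./embeddings/',
)

# Reuses the saved tensor, maps it to 50 dimensions and adds HDBSCAN labels.
clusterdf = embeddedTextClustering(
    infolderpath='./clusterfiles/',
    columnName='abstract',
    emdeddingspath='./embeddings/embeddedCorpus.pt',  # parameter name as spelled in the source
    outpath='./embeddings/',
)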
print('\tDone.\nStarting clustering.') cluster = hdbscan.HDBSCAN( min_cluster_size=20, diff --git a/tox.ini b/tox.ini index 389f70c..40887f1 100644 --- a/tox.ini +++ b/tox.ini @@ -17,9 +17,15 @@ commands = pytest {posargs} [testenv:docs] description = invoke sphinx-build to build the HTML docs -basepython = python3.7 +basepython = python3.9 deps = - sphinx >= 1.7.5, < 2 + sphinx sphinx_rtd_theme + plotly + hdbscan + umap-learn + torch + sentence-transformers + https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0.tar.gz#egg=en_core_web_lg commands = sphinx-build -d "{toxworkdir}/docs_doctree" docs "{toxworkdir}/docs_out" --color -W -bhtml {posargs} python -c 'import pathlib; print("documentation available under file://\{0\}".format(pathlib.Path(r"{toxworkdir}") / "docs_out" / "index.html"))' From 4849addc967b8076b40babfa56d4f9c720f34b88 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 5 Jan 2022 17:42:13 +0100 Subject: [PATCH 23/53] minor fixes --- src/semanticlayertools/visual/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index c49aef8..5952052 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -100,8 +100,8 @@ def embeddedTextPlotting(infolderpath: str, columnName: str, outpath: str): except ValueError: raise dataframe = pd.concat(clusterdf, ignore_index=True) - dataframe = dataframe.dropna(subset=[columnName], axis=0) - corpus = [x[0] for x in dataframe[columnName].values if x] + dataframe = dataframe.dropna(subset=[columnName], axis=0).reset_index(drop=True) + corpus = [x[0] for x in dataframe[columnName].values] print('Start embedding.') corpus_embeddings = model.encode( corpus, @@ -144,7 +144,7 @@ def embeddedTextClustering( except ValueError: raise dataframe = pd.concat(clusterdf, ignore_index=True) - dataframe = dataframe.dropna(subset=[columnName], axis=0) + dataframe = dataframe.dropna(subset=[columnName], axis=0).reset_index(drop=True) print('Loading embedding.') corpus_embeddings = torch.load(emdeddingspath) print('\tDone\nStarting mapping to lower dimensions.') From c97976f61d1c343ceac04e0b27e4ba821daa64f9 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 6 Jan 2022 13:11:24 +0100 Subject: [PATCH 24/53] add doc --- docs/_static/bmbf.png | Bin 0 -> 12534 bytes docs/_static/logo.png | Bin 0 -> 13387 bytes docs/_static/mpiwg.png | Bin 0 -> 37002 bytes docs/clustering.rst | 7 +- docs/conf.py | 6 +- docs/index.rst | 16 ++- docs/linkage.rst | 5 +- src/semanticlayertools/clustering/infomap.py | 76 +++++++++--- src/semanticlayertools/clustering/leiden.py | 68 +++++++++-- src/semanticlayertools/clustering/reports.py | 116 ++++++++++++++++--- src/semanticlayertools/linkage/cocitation.py | 1 - 11 files changed, 247 insertions(+), 48 deletions(-) create mode 100644 docs/_static/bmbf.png create mode 100644 docs/_static/logo.png create mode 100644 docs/_static/mpiwg.png diff --git a/docs/_static/bmbf.png b/docs/_static/bmbf.png new file mode 100644 index 0000000000000000000000000000000000000000..02558031aadb9fc757662041de5d4a5c54eaa3eb GIT binary patch literal 12534 zcmb_@WmHvR_vf|1KtMqRM38QfkPcDlQlz`3TRKGr3F$5+1*H2@0@4iv(#@qCX_$TA znYCs<%&b|n{^MH62hTnCoacG=ulDxKYdP_|w~22f5Qw{y60a2C_ayw}-o${v=j!8H z5QwL>lCMM*of9{byi_BVrY?0n9^H8Nx}DAa+ZTLa$3ld8FpF2=86C6uj!NK?5WQeP zo`0V3z^jy}_WhP`HSwN`Ry-Zi$uOsjjd~Hlv~Y5BspmS~?W@|MZMt1I*#!>|aVocr 
[GIT binary patch literals omitted: base85-encoded image data for
 docs/_static/bmbf.png (12534 bytes), docs/_static/logo.png (13387 bytes)
 and docs/_static/mpiwg.png (37002 bytes); the patch series breaks off
 inside the mpiwg.png literal.]
zL=LXX`hC-Vaj9Q-+;RJ!0k3x)+8_`&gfCgmzIo5TAly1GgRO6JDHe1XmWQ&f-Th zZOXEKR9N$i`X!1imi&7Cyhj$&O1r-vl?*fIHhO(k$^FwHQmQR}k@KtvnPNB_%2w+%J zm2v*kK7*VWWc#25K=?GA)T;&;d}`GGalQ>N(?iZ4&dN!X+|5z*C6It)*$Y66oAp@> zMeG3i{8#|42ecX<>_6vCips7oEpK>6KOT86j~B~lzy=h+rpt3B3O@-uXxPo?iQ6C+ zVgZ#7YVpkX>wF*3;f`z$174RGt0jBFXPoj|fZ%G&wuI;fz`W77-TUFp+Oo?m^CTK6 zcuJdyiB_MDd}L=AI=k=SSnyLJ;Dv_9TS~F>NC~{w6qlT@HC<$Eo+!|6Gx#~Yz(t97 zfy06so1>IC=Vw~}YVO0#nKOGb|JD=WE%#Q4OHPA0qK7#TZO$R>MOEQ&`dR;xIXzyO z4u`N-W;k(YN6z=|yWKsuZEhKd;f@2hdk&2&xG~B3O5EH6?T{DajkPPMOYrT=vv1R} zL+1G+;0v2^$z2NP%|N}oN>qN=#ESY2$(2q%H*L=C4zP0pK11PEVJ!L#Z4UhfQ3PiX z;uPPkL{ITiX##lB(zFg^=^d)gPT<>6T?HR={wpEJ9gi^&2=|fk&!Licb2dzghT7;% zzY=^q(`t|&TaGrFwr*TKIsluq0Hpo&<*_NZ zlWF`u?sU#e6og?3ZV|X@aMo>CzQK1^jcZImD8J;2I29Xd2ZuHwgPg8lFF^C-weG!( z7v6G2Xrk4()11z@A$H-fiuwrxhT)o1X#i)qR?LIkdbG0I7_ZjWr5 zHnsTgdhS{L!meH2MZmX(^4$Wki()*7e}U!i6Nzm*O|@m+_~<4tvkIpLlt@=#0q_k zZSntcg)c|H>Psu@a3ot^$x^`L0Z+CHnO{;MF$yeL;O@1v+4l;#Cd&;n;-aiUx2=9` z;J9(b_I7=1q{Kh7*7sY%SY31w1|Q6LGptw7+oPBB&TMmMbee0~QlAU`abrBzdcq3- znrW<2QTY=?4m#>ny%R_}5xCti4Bj`;|5pLNXY0qFjr-t9v?adt(eLZ<5j-n29{>s% zA5s=VtNJsgr1FC*kjO^sR3Q%#2O( zFYzMc#2gFiEF%xuwf(6n&le}0;`~Xb0gt%sl`Fm7RX;l}z?74@(5}aaY%la>Cq7yK z!H4oKaO$rXciU@`JQ!h|k#s{6eAA_%ET1q1pn^3~)5N@l3q)k(i{VQCOg)!q&N8c} z@*qtFt^;Ub2Oc&AVqB1;E@uia&y~#vI%8M(PH`&051IqXTznZ`+NBVoYydaCz1SgM_4^&7cmMsq zF>~=aymB&7_89ncx&SivK4|MF`j$`+*-`!&AD}m37dxU$%^eQ_Y#+IER?i z^-QA`Jj3iZ_`s#eyb4LomON3_(9gl zK5b7s9zYWDj3Uo&$RRIER*64o+S1P^1E8)Yxu#Y7apTBe9Ai+Y5^M1rw#ub3wproo zVqJMZv;f}Mg`?h*-Q$hxeZK$*-;R1?8=sF_OPmA_?F7zYDURLE9KGL{pc{u?IY`Fy zBQHsT+)|*86ZL%u4!B=L%ZhqoS!LbR6hEiIbzG~TO~vw>Og)+|?uE$Ux31~+kD3zV z2ItLl_wns$xfDPLU1G6603N=>GJ65!Nb>PLYDG6WR{i1=hzgbf07>ob4Xf73yMyL~ zyL2WLtMp{z(9TxS?M1ZtG49_JYOnWQF4xv%>X3P}O92VK*`=?nJqiUp)visE+$aCi zRpmTf?@ERc%u`!74M5_tstO=C-&N(AXqA>o9LZnj*0t{2s%aAelNX@9p^21BCl6;z zGtd}7EH(z~5gZ>OF&g&{d)LvRxd>@B_};yipSpyrl7=W1>%UKo9R6Gkt`=F9I~y#C4b zND8E6$U3qb86zo>T?#-H@gz9lL$R2zF}5U%HReAbacP5ZyACahJc42mV>^k5u&GFg z6ixyNL*0$T!KWxrymV?gor?0SNoDm-sg;(c@<9O!zWJcKY%?Va@S;BRnXidXvklQ# z01h?cD6c*}*Y{9`dlqfdUuoNMWB)Mp5hf}5P<%X2MF0C|vQOZ8Kiy-m!G;rYnXF&d zl^qI5@XZcgWvy{2ARzDr5XH%5Or{-rPjwk|e8m)F3Ym7a=QXveb~SoNfBy>gFlgI? zi7Jc(2OQGBAnMp+*6~fhR#`)~DUkNAYPLHl>z5RWM*%?ip7HY7DV;Pl6sPm2;8@cZ zmC(4LSTUFmu-IvgvM=!|Ai+02jbskHQvjQAb89R1wR<=9{Ug(oMO-}x;s{blOY6eN zXGp;7dzp?8r&WpFmm1#X7_Zf0%82~dxSFsDd>1%u=j z(aIG{e4BeA{?ee#^Xw)HHrVv@{uS9|}nDwa;91 z`kv5HUhZdA*dVfy+hXA@M1-X?vI8-z)-@jPg}4^UA6`Vg1?Y^%h79R}$8Im@>uv14I#Ptwtue z`kYm~jI$R7B>38EK00l0$gnqLy${A3Ah_6ld;#s>o|>{{%+FWA<NJiS4$0j8Y6prUBNWqH}10up@fK0}?L ze~pceDieX*0iLgyYHbDr5>s@Ga!o>q31Gs$nU7+->Fc1*=k)&UE491yh^a z-~9ggwP+e6_d%Uy6EgmQ3p9rmI~3@|6;ih+PMon7x~VXTliCFV<{G#Sx+EFDZK<`? zIZAP+UEDcF(z!<=1@oPIer+>4QZt)X>x1)5E0;h}jS|Z9!On8g7r&>2(;ESmDXh%+ z-<>L-JE@}nJKGEBT0nqj|Bp?7_UeUMz&?BUm3o~Wx5zf#O#~y=`y^s_rR0_PK}A`Iq(B-JXsC7_ zgAKBWpyd$deH2H;s=m!#ePj-ZC%%;co8hB zrKPvv=mk448WkEE+(V#}XR)!ycNMAp`e@~3eEumQ!8iYNAbZe7DFEO-N@&{4QSDfC z&}GjB3l=y#f^B!?*@OGNk3imQu$ci_uaB$z!a_DMZysxB49g9-&N!=bOtz=M0}r@M zHmvu40^mF-klM(vn})(&&?VSXn+VHqee~r4yscbOehwSQuj=FF zz5OU4Rn6=-C$dLffC4}KzAxH)C(Z+al`V`^+ zm|9W)S^D+J8gfAaDf7(*m1P@}0u~e?|8%Rq3>@UxL*1bIT8iL!?J#)1JkgvL!i7B{ z-#sl99}EkzUnq>U9e9zKiX!prvNBIoN4&hVGX*60+IekQ9&`_ +Socio-epistemic networks: Modelling Historical Knowledge Processes, +part of Department I of the Max Planck Institut for the History of Science and +funded by the Federal Ministry of Education and Research, Germany (Grant No. 01 UG2131). + +.. image:: _static/bmbf.png .. 
toctree:: - :maxdepth: 2 + :maxdepth: 3 :caption: Contents: cleaning diff --git a/docs/linkage.rst b/docs/linkage.rst index 3e65b98..8e3b989 100644 --- a/docs/linkage.rst +++ b/docs/linkage.rst @@ -1,11 +1,14 @@ Word scoring and linkage ======================== +Link papers by Ngram scoring +**************************** .. automodule:: semanticlayertools.linkage.wordscore :members: :undoc-members: - +Generate network of cocitations +******************************* .. automodule:: semanticlayertools.linkage.cocitation :members: :undoc-members: diff --git a/src/semanticlayertools/clustering/infomap.py b/src/semanticlayertools/clustering/infomap.py index f566d68..4bbabd2 100644 --- a/src/semanticlayertools/clustering/infomap.py +++ b/src/semanticlayertools/clustering/infomap.py @@ -1,45 +1,91 @@ import os +import re from tqdm import tqdm import infomap class Clustering(): - """Cluster using infomap.""" + """Cluster mulitlayer time-dependent networks using the infomap algorithm. + + Calculates clusters using the infomap algorithm. Input files are assumed + to have multilayer Pajek format and contain the year in four digit format. + The default settings for running the method assume an undirected multilayer + network and will use at most 5 optimization runs. + + :param inpath: Path to input pajek files + :type inpath: str + :param outpath: Path for writing resulting cluster data + :type outpath: str + :param recreate: Toggle recreation of already exisiting files + :type recreate: bool + :param infomapSettings: Initializing arguments for the infomap algorithm. + :type infomapSettings: str + :param debug: Toggle writing of debug info to standard output. + :type debug: bool + + .. seealso:: + Martin Rosvall and Carl T. Bergstrom (2008). + Maps of information flow reveal community structure in complex networks. + PNAS, 105, 1118. 10.1073/pnas.0706851105 + """ def __init__( self, - infomapSettings="-N5 -imultilayer -fundirected --silent" + inpath: str, + outpath: str, + recreate: bool = False, + infomapSettings: str = "-N5 -imultilayer -fundirected --silent", + debug: bool = False ): + self.inpath = inpath + self.outpath = outpath self.infomult = infomap.Infomap(infomapSettings) + self.recreate = recreate + self.debug = debug + + def calcInfomap(self, inFilePath): + """Calculate clusters for one pajek file. + + Writes found cluster (i.e. module) information in CLU and FlowTree file + format to output path. + + :param inFilePath: Path to input pajek file + :type inFilePath: str + :raises OSError: If one of the output files for this year already exists. + :returns: Writes two files with found cluster information, method return value is empty + :rtype: None - def calcInfomap(self, inFilePath, outPath, recreate=False, debug=False): - """Calc clusters for one pajekt file.""" - year = inFilePath.split(os.path.sep)[-1].split('_')[1].split('.')[0] - cluFilePath = f'{outPath}slice_{year}.clu' - ftreeFilePath = f'{outPath}slice_{year}.ftree' + .. seealso:: + Infomap python documentation on mapequation + `Infomap module `_ + """ + filename = inFilePath.split(os.pathsep)[-1] + year = re.findall(r'\d{4}', filename)[0] + cluFilePath = f'{self.outpath}slice_{year}.clu' + ftreeFilePath = f'{self.outpath}slice_{year}.ftree' if os.path.isfile(cluFilePath) or os.path.isfile(ftreeFilePath): - if recreate is False: - raise IOError( + if self.recreate is False: + raise OSError( f'Files at {cluFilePath} or {ftreeFilePath} exists. Set recreate = True to rewrite files.' 
) - if recreate is True: + if self.recreate is True: os.remove(cluFilePath) os.remove(ftreeFilePath) self.infomult.readInputData(inFilePath) self.infomult.run() self.infomult.writeClu(cluFilePath) self.infomult.writeFlowTree(ftreeFilePath) - if debug: + if self.debug is True: print( f"Clustered in {self.infomult.maxTreeDepth()} levels with codelength {self.infomult.codelength}" ) print("\tDone: Slice {0}!".format(year)) return - def run(self, pajekPath='./', outPath='./', recreate=False, debug=False): - """Calculate infomap clustering for all pajek files in path.""" + def run(self): + """Calculate infomap clustering for all pajek files in input path.""" pajekFiles = sorted( - [pajekPath + x for x in os.listdir(pajekPath) if x.endswith('.net')] + [self.inpath + x for x in os.listdir(self.inpath) if x.endswith('.net')] ) for file in tqdm(pajekFiles): - self.calcInfomap(inFilePath=file, outPath=outPath, debug=debug) + self.calcInfomap(inFilePath=file) diff --git a/src/semanticlayertools/clustering/leiden.py b/src/semanticlayertools/clustering/leiden.py index c695d21..2d2e561 100644 --- a/src/semanticlayertools/clustering/leiden.py +++ b/src/semanticlayertools/clustering/leiden.py @@ -1,24 +1,51 @@ import os import time import re -from typing import TypeVar - from tqdm import tqdm import igraph as ig import leidenalg as la -debugVar = TypeVar('debugVar', bool, str) - class TimeCluster(): - """Cluster time-sliced data with the Leiden algorithm.""" + """Cluster time-sliced data with the Leiden algorithm. + + Calculates temporal clusters of e.g. time-sliced cocitation or citation + data, using the Leiden algorithm . Two nodes are assumed to be identical in + different year slices, if the node name is the same. + This could be e.g. the bibcode or DOI. + + Input files are assumed to include the year in the filename, have an ending + `_GC.net` to denote their giant component character and should be in Pajek + format. + + The resolution parameter can be seen as a limiting density, above + which neighbouring nodes are considered a cluster. The interslice coupling + describes the influcence of yearly order on the clustering process. See doc + for the Leiden algorithm for more detailed info. + + :param inpath: Path for input network data + :type inpath: str + :param outpath: Path for writing output data + :type outpath: str + :param resolution: Main parameter for the clustering quality function (Constant Pots Model) + :type resolution: float + :param intersliceCoupling: Coupling parameter between two year slices, also influences cluster detection + :type intersliceCoupling: float + :param timerange: The time range for considering input data (default=1945,2005)) + :type timerange: tuple + :raises OSError: If the output file already exists at class instantiation + + .. seealso:: + Traag, V.A., Waltman. L., Van Eck, N.-J. (2018). + From Louvain to Leiden: guaranteeing well-connected communities. + Scientific reports, 9(1), 5233. 10.1038/s41598-019-41695-z + """ def __init__( self, inpath: str, outpath: str, resolution: float = 0.003, intersliceCoupling: float = 0.4, timerange: tuple = (1945, 2005), - debug: debugVar = False ): starttime = time.time() self.inpath = inpath @@ -26,7 +53,6 @@ def __init__( self.res_param = resolution self.interslice_param = intersliceCoupling self.timerange = timerange - self.debug = debug self.outfile = os.path.join( outpath, @@ -57,8 +83,28 @@ def __init__( f"loaded in {time.time() - starttime} seconds." 
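For orientation, a minimal usage sketch of the reworked infomap `Clustering` class documented in the hunk above; the folder names are placeholder assumptions, and note that `outpath` is used as a plain filename prefix, so a trailing separator is expected.

~~~python
# Illustrative sketch (not from the patch series): cluster every multilayer
# Pajek file in a folder with the infomap-based Clustering class added above.
# Folder names are hypothetical placeholders.
from semanticlayertools.clustering.infomap import Clustering

clusterer = Clustering(
    inpath="./pajek/",       # *.net multilayer Pajek files, four-digit year in each filename
    outpath="./clusters/",   # used as a plain string prefix, so keep the trailing slash
    recreate=True,           # overwrite existing slice_<year>.clu / .ftree output
)
clusterer.run()              # iterates over all *.net files and writes CLU and FlowTree files
~~~

Alternative infomap options can be passed through the constructor's `infomapSettings` string if a different optimization setup is needed.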
) - def optimize(self, clusterSizeCompare: int=1000): - """Optimize clusters accross time slices.""" + def optimize(self, clusterSizeCompare: int = 1000): + """Optimize clusters accross time slices. + + This runs the actual clustering and can be very time and memory + consuming for large networks. Depending on the obtained cluster results, + this method has to be run iteratively with varying resolution parameter. + Output is written to file, with filename containing chosen parameters. + + The output CSV contains information on which node in which year belongs + to which cluster. As a first measure of returned clustering, the method + prints the number of clusters found above a threshold defined by + `clusterSizeCompare`. This does not influence the output clustering. + + :param clusterSizeCompare: Threshold for `interesting` clusters + :type clusterSizeCompare: int + :returns: Tuple of output file path and list of found clusters in tuple format (node, year, cluster) + :rtype: tuple + + .. seealso:: + Documentation of time-layer creation routine: + `Leiden documentation `_ + """ starttime = time.time() layers, interslice_layer, _ = la.time_slices_to_layers( @@ -109,7 +155,9 @@ def optimize(self, clusterSizeCompare: int=1000): outfile.write( f"{elem[0]},{elem[1]},{elem[2]}\n" ) - largeclu = [(x,len(x.vs)) for x in subgraphs if len(x.vs)>clusterSizeCompare] + largeclu = [ + (x, len(x.vs)) for x in subgraphs if len(x.vs) > clusterSizeCompare + ] print( f'Finished in {time.time() - starttime} seconds.' f"Found {len(subgraphs)} clusters, with {len(largeclu)} larger then {clusterSizeCompare} nodes." diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index a4b27c6..1b9ff04 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -9,7 +9,6 @@ import textacy import textacy.tm import pandas as pd -import numpy as np import warnings num_processes = multiprocessing.cpu_count() @@ -19,15 +18,54 @@ class ClusterReports(): + """Generate reporting on time-clusters. + + Generate reports to describe the content for all found clusters above a + minimal size by collecting metadata for all publications in each cluster, + finding the top 20 authors and affiliations of authors involved in the + cluster publications, and running basic NMF topic modelling with N=20 and + N=50 topics (english language models are used!). + For each cluster a report file is written to the output path. + + Input CSV filename is used to create the output folder in output path. For + each cluster above the limit, a subfolder is created to contain all metadata + for the cluster. The metadata files are assumed to be in JSONL format and + contain the year in the filename. + + :param infile: Path to input CSV file containing information on nodeid, clusterid, and year + :type infile: str + :param metadatapath: Path to JSONL (JSON line) formated metadata files. + :type metadatapath: str + :param outpath: Path to create output folder in, foldername reflects input filename + :type outpath: str + + :param textcolumn: The dataframe column of metadata containing textutal for topic modelling (default=title) + :type textcolumn: str + :param numberProc: Number of CPU the routine will use (default = all!) 
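The refactored `TimeCluster` class above can be driven in the same spirit; the following is a sketch under assumed paths, with the return shape taken from the new docstring.

~~~python
# Illustrative sketch (not from the patch series): temporal Leiden clustering of
# yearly giant-component Pajek files ("*_GC.net" with a four-digit year in the name).
from semanticlayertools.clustering.leiden import TimeCluster

tc = TimeCluster(
    inpath="./cocite/",          # yearly *_GC.net input networks
    outpath="./timeclusters/",
    resolution=0.003,            # CPM resolution, larger values give smaller, denser clusters
    intersliceCoupling=0.4,      # coupling strength between consecutive year slices
    timerange=(1960, 1990),
)
# Per the new docstring, optimize returns the CSV path and (node, year, cluster) tuples.
outfile, clusters = tc.optimize(clusterSizeCompare=1000)
~~~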
+ :type numberProc: int + :param minClusterSize: The minimal cluster size, above which clusters are considered (default=1000) + :type minClusterSize: int + :param timerange: Time range to evalute clusters for (usefull for limiting computation time, default = (1945, 2005)) + :type timerange: tuple + """ def __init__( self, infile: str, metadatapath: str, outpath: str, + textcolumn: str = 'title', + authorColumnName: str = 'author', + affiliationColumnName: str = 'aff', + publicationIDcolumn: str = 'nodeID', numberProc: int = num_processes, minClusterSize: int = 1000, timerange: tuple = (1945, 2005) ): + """Constructor method""" self.numberProc = numberProc self.minClusterSize = minClusterSize self.metadatapath = metadatapath + self.textcolumn = textcolumn + self.authorColumnName = authorColumnName + self.affiliationColumnName = affiliationColumnName + self.publicationIDcolumn = publicationIDcolumn clusterdf = pd.read_csv(infile) basedata = clusterdf.groupby(['year', 'cluster']).size().to_frame('counts').reset_index() self.largeClusterList = list( @@ -47,9 +85,19 @@ def __init__( os.mkdir(os.path.join(self.outpath, f'Cluster_{clu}')) def create_corpus(self, dataframe): - """Create corpus out of dataframe.""" + """Create corpus out of dataframe. + + Using the text contained in the cluster metadata to generate a corpus. + After some basic preprocessing each text is used to generate a Spacy doc, + of which only the lemmatized words without stop words are considered. + + :params dataframe: Input dataframe + :type dataframe: `pd.Dataframe` + :returns: A textacy corpus file with english as the base language + :rtype: `textacy.Corpus` + """ docs = [] - titles = [x[0] for x in dataframe.title.values if type(x) == list] + titles = [x[0] for x in dataframe[self.textcolumn].values if type(x) == list] for title in tqdm(titles, leave=False): try: # text pre-processing @@ -74,7 +122,22 @@ def create_corpus(self, dataframe): def find_topics( self, corpus_titles: list, n_topics: int, top_words: int, ): - """Calculate topics in corpus.""" + """Calculate topics in corpus. + + Use NMF algorithm to calculate topics in corpus file for `n_topics` + topics, returning `top_words` most common words for each topic. + Each word has to occure at least twice in the corpus and at most in 95% + of all documents. + + :param corpus_titles: The corpus containing the preprocessed texts. + :type corpus_titles: `textacy.Corpus` + :param n_topics: Number of considered topics + :type n_topics: int + :param top_words: Number of returned words for each found topic + :type top_words: int + :returns: List of found topics with top occuring words + :rtype: str + """ vectorizer = textacy.representations.vectorizers.Vectorizer( tf_type="linear", idf_type="smooth", @@ -93,17 +156,26 @@ def find_topics( model.fit(doc_term_matrix) topics = [] - for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_words): - topics.append("topic " + str(topic_idx) + ": " + " ".join(top_terms)) + for topic_idx, top_terms in model.top_topic_terms( + vectorizer.id_to_term, top_n=top_words + ): + topics.append( + "topic " + str(topic_idx) + ": " + " ".join(top_terms) + ) outtext = f'\n\n\tTopics in cluster for {n_topics} topics:\n' for topic in topics: outtext += f'\t\t{topic}\n' return outtext - def fullReport(self, cluster, authorColumnName: str = 'author', - affiliationColumnName: str = 'aff' - ): - """Generate full cluster report.""" + def fullReport(self, cluster): + """Generate full cluster report for one cluster. 
+ + :param cluster: The cluster number to process + :type cluster: int or str + :raises ValueError: If input cluster data can not be read. + :returns: Report text with all gathered informations + :rtype: str + """ starttime = time.time() clusterpath = os.path.join(self.outpath, f'Cluster_{cluster}') clusterfiles = os.listdir(clusterpath) @@ -120,13 +192,13 @@ def fullReport(self, cluster, authorColumnName: str = 'author', inputnodes = set(basedf.node.values) notFound = inputnodes.difference(set(dfCluster.nodeID.values)) topAuthors = Counter( - [x for y in dfCluster[authorColumnName].fillna('').values for x in y] + [x for y in dfCluster[self.authorColumnName].fillna('').values for x in y] ).most_common(20) authortext = '' for x in topAuthors: authortext += f'\t{x[0]}: {x[1]}\n' topAffils = Counter( - [x for y in dfCluster[affiliationColumnName].fillna('').values for x in y] + [x for y in dfCluster[self.affiliationColumnName].fillna('').values for x in y] ).most_common(21) affiltext = '' for x in topAffils[1:]: @@ -156,12 +228,19 @@ def fullReport(self, cluster, authorColumnName: str = 'author', print('\t\tFinished topics.') return outtext - def _mergeData(self, filename, publicationIDcolumn: str = 'nodeID'): + def _mergeData(self, filename): + """Merge metadata for cluster nodes. + + Writes all metadata for nodes in cluster to folders. + + :param filename: Metadata input filename + :type filename: str + """ filepath = os.path.join(self.metadatapath, filename) data = pd.read_json(filepath, lines=True) selectMerge = data.merge( self.clusternodes, - left_on=publicationIDcolumn, + left_on=self.publicationIDcolumn, right_on='node', how='inner' ) @@ -177,6 +256,14 @@ def _mergeData(self, filename, publicationIDcolumn: str = 'nodeID'): return '' def gatherClusterMetadata(self): + """Initial gathering of metadata for clusters. + + For all files in the metadata path, call `_mergeData` if the found + year in the filename falls in the bounds. + + This step needs to be run once, the all cluster metadata is generated + and can be reused. 
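A minimal sketch of the full `ClusterReports` workflow described above, assuming placeholder paths and a clustering CSV produced by the Leiden step:

~~~python
# Illustrative sketch (not from the patch series): reporting on all clusters above
# the minimal size. Paths and the CSV filename are placeholders.
from semanticlayertools.clustering.reports import ClusterReports

reports = ClusterReports(
    infile="./timeclusters/clusters.csv",  # CSV with node, year and cluster columns
    metadatapath="./metadata/",            # JSONL metadata files with the year in the filename
    outpath="./reports/",
    textcolumn="title",                    # column fed into the NMF topic models
    minClusterSize=1000,
    timerange=(1960, 1990),
)
reports.gatherClusterMetadata()   # run once, merges the metadata into one subfolder per cluster
reports.writeReports()            # writes Cluster_<id>.txt with top authors, affiliations and topics
~~~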
+ """ filenames = os.listdir(self.metadatapath) yearFiles = [] for x in filenames: @@ -191,6 +278,7 @@ def gatherClusterMetadata(self): return def writeReports(self): + """Generate reports and write to output path.""" for cluster in tqdm(self.largeClusterList, leave=False): outtext = self.fullReport(cluster) with open(f'{self.outpath}Cluster_{cluster}.txt', 'w') as file: diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index 2872b33..9d30fb6 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -1,4 +1,3 @@ -"""Link documents by cocitation.""" import os import time import re From 62d500a29db36ff186554727f6d9435cbdd56a0e Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 6 Jan 2022 13:32:13 +0100 Subject: [PATCH 25/53] add readthedocs yaml --- .readthedocs.yaml | 15 +++++++++++++++ docs/requirements.txt | 8 ++++++++ 2 files changed, 23 insertions(+) create mode 100644 .readthedocs.yaml create mode 100644 docs/requirements.txt diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..3943fd8 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,15 @@ +# File: .readthedocs.yaml + +version: 2 + +build: + os: "ubuntu-20.04" + tools: + python: "3.9" + +sphinx: + configuration: docs/conf.py + +python: + install: + - requirements: docs/requirements.txt diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..25dc0d6 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,8 @@ +sphinx +sphinx_rtd_theme +plotly +hdbscan +umap-learn +torch +sentence-transformers +https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0.tar.gz#egg=en_core_web_lg From 5c129dad54c4d7ca97b9af760ec298c2ba007c3b Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 6 Jan 2022 14:15:13 +0100 Subject: [PATCH 26/53] add req --- docs/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/requirements.txt b/docs/requirements.txt index 25dc0d6..80636a9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -6,3 +6,4 @@ umap-learn torch sentence-transformers https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0.tar.gz#egg=en_core_web_lg +semanticlayertools From 4b547464eafdc028bd9c0bfc9ed99a715055f457 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 6 Jan 2022 14:39:22 +0100 Subject: [PATCH 27/53] add readme and license files from mainpage to docs --- LICENSE => LICENSE.md | 0 README.md | 16 ++++++++++++++-- docs/conf.py | 4 +++- docs/index.rst | 2 ++ docs/license.rst | 3 +++ docs/readme.rst | 3 +++ docs/requirements.txt | 1 + tox.ini | 1 + 8 files changed, 27 insertions(+), 3 deletions(-) rename LICENSE => LICENSE.md (100%) create mode 100644 docs/license.rst create mode 100644 docs/readme.rst diff --git a/LICENSE b/LICENSE.md similarity index 100% rename from LICENSE rename to LICENSE.md diff --git a/README.md b/README.md index df7a75b..04d4856 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,15 @@ -# SemanticLayerTools +## SemanticLayerTools -Collects tools to create semantic layers in the socio-epistemic networks framework. Source material can be any structured corpus with metadata of authors, time, and at least one text column. \ No newline at end of file +Collects tools to create semantic layers in the socio-epistemic networks framework. 
Source material can be any structured corpus with metadata of authors, time, and at least one text column. + +## Installation + +Using pip `pip install semanticlayertools` + +## Testing + +Using tox `tox` + +## Building documentation + +Using tox `tox -e docs` diff --git a/docs/conf.py b/docs/conf.py index 24a23b7..dea5654 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -34,7 +34,8 @@ # ones. extensions = [ 'sphinx.ext.autodoc', - 'sphinx.ext.intersphinx' + 'sphinx.ext.intersphinx', + 'm2r2' ] # Add any paths that contain templates here, relative to this directory. @@ -45,6 +46,7 @@ # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +source_suffix = [".rst", ".md"] # -- Options for HTML output ------------------------------------------------- diff --git a/docs/index.rst b/docs/index.rst index 112785c..f53ba76 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -20,11 +20,13 @@ funded by the Federal Ministry of Education and Research, Germany (Grant No. 01 :maxdepth: 3 :caption: Contents: + readme cleaning pipelines linkage clustering visual + license diff --git a/docs/license.rst b/docs/license.rst new file mode 100644 index 0000000..4f3620b --- /dev/null +++ b/docs/license.rst @@ -0,0 +1,3 @@ +License +======= +.. mdinclude:: ../LICENSE.md diff --git a/docs/readme.rst b/docs/readme.rst new file mode 100644 index 0000000..60d167e --- /dev/null +++ b/docs/readme.rst @@ -0,0 +1,3 @@ +README +====== +.. mdinclude:: ../README.md diff --git a/docs/requirements.txt b/docs/requirements.txt index 80636a9..63818de 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,6 @@ sphinx sphinx_rtd_theme +m2r2 plotly hdbscan umap-learn diff --git a/tox.ini b/tox.ini index 40887f1..24a7a07 100644 --- a/tox.ini +++ b/tox.ini @@ -21,6 +21,7 @@ basepython = python3.9 deps = sphinx sphinx_rtd_theme + m2r2 plotly hdbscan umap-learn From 0eb2e2115561b5f6661f518be5b85a84c6ee79e6 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Fri, 7 Jan 2022 11:16:31 +0100 Subject: [PATCH 28/53] clean doc building --- README.md | 47 +++++++++++++++- docs/index.rst | 2 +- docs/readme.rst | 4 +- docs/requirements.txt | 6 -- docs/visual.rst | 58 +++++++++++++++++++- setup.cfg | 7 +-- src/semanticlayertools/clustering/reports.py | 6 +- src/semanticlayertools/visual/utils.py | 31 ++++++----- tox.ini | 6 -- 9 files changed, 124 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 04d4856..ac127c3 100644 --- a/README.md +++ b/README.md @@ -2,14 +2,55 @@ Collects tools to create semantic layers in the socio-epistemic networks framework. Source material can be any structured corpus with metadata of authors, time, and at least one text column. +Documentation is available on [ReadTheDocs](https://semanticlayertools.readthedocs.io/). + ## Installation -Using pip `pip install semanticlayertools` +tl;dr Use pip + +~~~bash +pip install semanticlayertools +~~~ + +Consider using a clean virtual environment to keep your main packages separated. +Create a new virtual environment and install the package + +~~~bash +python3 -m venv env +source env/bin/activate +pip install semanticlayertools +~~~ + +To use some sentence embedding utility functions please install with the +`embeddml` option + +~~~bash +pip install semanticlayertools[embeddml] +~~~ ## Testing -Using tox `tox` +Tests can be run by installing the _dev_ requirements and running `tox`. 
+ +~~~bash +pip install semanticlayertools[dev] +tox +~~~ ## Building documentation -Using tox `tox -e docs` +The documentation is build using _sphinx_. Install with the _dev_ option and run + +~~~bash +pip install semanticlayertools[dev] +tox -e docs +~~~ + +## Funding information + +The development is part of the research project [ModelSEN](https://modelsen.mpiwg-berlin.mpg.de) + +> Socio-epistemic networks: Modelling Historical Knowledge Processes, + +in Department I of the Max Planck Institute for the History of Science +and funded by the Federal Ministry of Education and Research, Germany (Grant No. 01 UG2131). diff --git a/docs/index.rst b/docs/index.rst index f53ba76..64da186 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,7 +11,7 @@ social, semiotic or semantic layers from text corpora. The development is part of the research project `ModelSEN `_ Socio-epistemic networks: Modelling Historical Knowledge Processes, -part of Department I of the Max Planck Institut for the History of Science and +part of Department I of the Max Planck Institute for the History of Science and funded by the Federal Ministry of Education and Research, Germany (Grant No. 01 UG2131). .. image:: _static/bmbf.png diff --git a/docs/readme.rst b/docs/readme.rst index 60d167e..21a7aa4 100644 --- a/docs/readme.rst +++ b/docs/readme.rst @@ -1,3 +1,3 @@ -README -====== +Introduction +============ .. mdinclude:: ../README.md diff --git a/docs/requirements.txt b/docs/requirements.txt index 63818de..c9c7e88 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,10 +1,4 @@ sphinx sphinx_rtd_theme m2r2 -plotly -hdbscan -umap-learn -torch -sentence-transformers -https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0.tar.gz#egg=en_core_web_lg semanticlayertools diff --git a/docs/visual.rst b/docs/visual.rst index 29e6a31..e5e6eec 100644 --- a/docs/visual.rst +++ b/docs/visual.rst @@ -1,6 +1,58 @@ Utility functions for visualizations ==================================== -.. automodule:: semanticlayertools.visual.utils - :members: - :undoc-members: +The usage of some of these methods requires installing the package with +the extra requirements for text embedding and clustering + +.. code-block:: bash + :linenos: + + pip install semanticlayertools[embeddml] + + +Representing temporal cluster evolution with a streamgraph +********************************************************** + +This utility function is meant to support the visualization of calculated +temporal clusters. Parameters to vary are the smoothing (bool) and the minimal +cluster size to consider (default=1000). + +.. code-block:: python + :linenos: + + streamgraph(file, smooth, minClusterSize) + + +Embedding a text corpus in 2 dimensions +*************************************** + +Meant to be used to visualize a corpus on 2D by embedding a text column using +the SentenceTransformer approach of SBERT and UMAP. Time consuming method! + +.. code-block:: python + :linenos: + + embeddedTextPlotting(infolderpath, columnName, outpath, umapNeighors) + +.. seealso :: + `SBERT docs `_ + `UMAP docs `_ + + +Clustering texts using SentenceEmbedding +**************************************** + +Similar to the above method but extended to help finding large scale structures +of a given text corpus. Similar to topic modelling, in addition makes use of +HDBSCAN clustering. Reuses previously generated embedding of corpus. + +.. 
code-block:: python + :linenos: + + embeddedTextClustering( + infolderpath, columnName, embeddingspath, outpath, + umapNeighors, umapComponents, hdbscanMinCluster + ) + +.. seealso :: + `HDBSCAN docs `_ diff --git a/setup.cfg b/setup.cfg index a328a5e..6e49e3f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,16 +36,11 @@ install_requires = [options.extras_require] all = %(embeddml)s - %(doc)s %(dev)s - %(test)s -doc = - sphinx dev = twine - %(test)s -test = tox + sphinx embeddml = torch umap-learn diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index 1b9ff04..2cbcd53 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -13,9 +13,6 @@ num_processes = multiprocessing.cpu_count() -mainLanguageCorp = 'en_core_web_lg' -nlp = spacy.load(mainLanguageCorp) - class ClusterReports(): """Generate reporting on time-clusters. @@ -96,6 +93,9 @@ def create_corpus(self, dataframe): :returns: A textacy corpus file with english as the base language :rtype: `textacy.Corpus` """ + mainLanguageCorp = 'en_core_web_lg' + nlp = spacy.load(mainLanguageCorp) + docs = [] titles = [x[0] for x in dataframe[self.textcolumn].values if type(x) == list] for title in tqdm(titles, leave=False): diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index 5952052..85febac 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -6,10 +6,6 @@ import numpy as np from scipy import stats -from collections import Counter -import plotly.express as px -import plotly.graph_objects as go - from sentence_transformers import SentenceTransformer import umap import hdbscan @@ -24,7 +20,10 @@ def gaussian_smooth(x, y, grid, sd): return (weights * y).sum(1) -def streamgraph(filepath: str, smooth: smoothing=False, minClusterSize: int=1000, showNthGrid: int=5): +def streamgraph( + filepath: str, smooth: smoothing = False, + minClusterSize: int = 1000, showNthGrid: int = 5 +): """Plot streamgraph of cluster sizes vs years. 
Based on https://www.python-graph-gallery.com/streamchart-basic-matplotlib @@ -59,8 +58,8 @@ def streamgraph(filepath: str, smooth: smoothing=False, minClusterSize: int=1000 grid, y_smoothed, labels=cluDict.keys(), - baseline="sym" - ,colors=plt.get_cmap('tab20').colors + baseline="sym", + colors=plt.get_cmap('tab20').colors ) pass @@ -86,7 +85,10 @@ def streamgraph(filepath: str, smooth: smoothing=False, minClusterSize: int=1000 return fig -def embeddedTextPlotting(infolderpath: str, columnName: str, outpath: str): +def embeddedTextPlotting( + infolderpath: str, columnName: str, outpath: str, + umapNeighors: int = 200, +): """Create embedding for corpus text.""" print('Initializing embedder model.') model = SentenceTransformer('all-MiniLM-L6-v2') @@ -113,7 +115,7 @@ def embeddedTextPlotting(infolderpath: str, columnName: str, outpath: str): ) print('\tDone\nStarting mapping to 2D.') corpus_embeddings_2D = umap.UMAP( - n_neighbors=15, + n_neighbors=umapNeighors, n_components=2, metric='cosine' ).fit_transform(corpus_embeddings) @@ -130,7 +132,9 @@ def embeddedTextPlotting(infolderpath: str, columnName: str, outpath: str): def embeddedTextClustering( - infolderpath: str, columnName: str, emdeddingspath: str, outpath: str + infolderpath: str, columnName: str, emdeddingspath: str, outpath: str, + umapNeighors: int = 200, umapComponents: int = 50, + hdbscanMinCluster: int = 500, ): """Create clustering based on embedding for corpus texts.""" print('Initializing embedder model.') @@ -149,8 +153,9 @@ def embeddedTextClustering( corpus_embeddings = torch.load(emdeddingspath) print('\tDone\nStarting mapping to lower dimensions.') corpus_embeddings_50D = umap.UMAP( - n_neighbors=15, - n_components=50, + n_neighbors=umapNeighors, + n_components=umapComponents, + min_dist=0.0, metric='cosine' ).fit_transform(corpus_embeddings) np.savetxt( @@ -161,7 +166,7 @@ def embeddedTextClustering( ) print('\tDone.\nStarting clustering.') cluster = hdbscan.HDBSCAN( - min_cluster_size=20, + min_cluster_size=hdbscanMinCluster, metric='euclidean', cluster_selection_method='eom' ).fit(corpus_embeddings_50D) diff --git a/tox.ini b/tox.ini index 24a7a07..5047f72 100644 --- a/tox.ini +++ b/tox.ini @@ -22,11 +22,5 @@ deps = sphinx sphinx_rtd_theme m2r2 - plotly - hdbscan - umap-learn - torch - sentence-transformers - https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0.tar.gz#egg=en_core_web_lg commands = sphinx-build -d "{toxworkdir}/docs_doctree" docs "{toxworkdir}/docs_out" --color -W -bhtml {posargs} python -c 'import pathlib; print("documentation available under file://\{0\}".format(pathlib.Path(r"{toxworkdir}") / "docs_out" / "index.html"))' From 2ed0677f42d04f062421223736ee973d02879b79 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Fri, 7 Jan 2022 11:36:30 +0100 Subject: [PATCH 29/53] bump version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 6e49e3f..a18cfc9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = semanticlayertools -version = 0.0.3 +version = 0.0.4 author = Malte Vogl author_email = mvogl@mpiwg-berlin.mpg.de description = Create semantic layers using different methods for word linking. 
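Taken together, the patched utilities can be used as sketched below; this assumes an installation with the `embeddml` extra, placeholder paths, and keeps the parameter spelling `umapNeighors` exactly as in the source.

~~~python
# Illustrative sketch (not from the patch series): the visual utilities after this
# patch. Requires `pip install semanticlayertools[embeddml]`; paths are placeholders.
from semanticlayertools.visual.utils import streamgraph, embeddedTextPlotting

# Streamgraph of temporal cluster sizes, read from the Leiden clustering CSV output.
fig = streamgraph(
    "./timeclusters/clusters.csv",
    smooth=True,             # toggle Gaussian smoothing of the yearly counts
    minClusterSize=1000,     # hide clusters below this size
)

# Embed a text column into 2D with SBERT ('all-MiniLM-L6-v2') and UMAP; slow on large corpora.
embeddedTextPlotting(
    infolderpath="./metadata/",
    columnName="title",
    outpath="./embeddings/",
    umapNeighors=200,        # parameter spelling as in the source
)
~~~

`embeddedTextClustering` then reuses the stored embeddings and adds UMAP dimension reduction plus HDBSCAN clustering on top, as described in the visual.rst section above.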
From ffbadc1a69c35a464d5a307e06dfa56d93b54134 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 24 Jan 2022 15:00:27 +0100 Subject: [PATCH 30/53] fix link in docs --- docs/visual.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/visual.rst b/docs/visual.rst index e5e6eec..3a1b6f3 100644 --- a/docs/visual.rst +++ b/docs/visual.rst @@ -55,4 +55,4 @@ HDBSCAN clustering. Reuses previously generated embedding of corpus. ) .. seealso :: - `HDBSCAN docs `_ + `HDBSCAN docs `_ From 1e497e9114795c3a4042f45905af79831744b277 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 28 Feb 2022 14:28:37 +0100 Subject: [PATCH 31/53] linting and add cleaning to docs, extend docs for pipelines --- docs/pipelines.rst | 4 ++ src/semanticlayertools/cleaning/text.py | 24 ++++++++-- src/semanticlayertools/clustering/infomap.py | 2 +- src/semanticlayertools/clustering/leiden.py | 2 +- src/semanticlayertools/clustering/reports.py | 6 +-- src/semanticlayertools/linkage/cocitation.py | 10 ++-- src/semanticlayertools/linkage/wordscore.py | 47 ++++++++++--------- .../pipelines/cocitetimeclusters.py | 37 ++++++++++++++- .../pipelines/wordscorenet.py | 30 +++++++++++- tests/linkage/test_wordscore.py | 1 + tox.ini | 6 +++ 11 files changed, 131 insertions(+), 38 deletions(-) diff --git a/docs/pipelines.rst b/docs/pipelines.rst index 0c2ff90..511a6f4 100644 --- a/docs/pipelines.rst +++ b/docs/pipelines.rst @@ -1,11 +1,15 @@ Pipelines for workflows ======================= +Cocitation clustering pipeline +****************************** .. automodule:: semanticlayertools.pipelines.cocitetimeclusters :members: :undoc-members: +Wordscore-Multilayer pipeline +***************************** .. automodule:: semanticlayertools.pipelines.wordscorenet :members: :undoc-members: diff --git a/src/semanticlayertools/cleaning/text.py b/src/semanticlayertools/cleaning/text.py index 7f3889e..dff5e7f 100644 --- a/src/semanticlayertools/cleaning/text.py +++ b/src/semanticlayertools/cleaning/text.py @@ -8,7 +8,17 @@ def lemmaSpacy(text): - """Clean text in dataframe column.""" + """Clean text using Spacy english language model. + + A spacy doc is created using the text. For each token which is not a + stopword and longer then 3 letters the lemma is returned in lowered form. + For historical reasons, input can also be of the form + text = list("Actual text"), which sometimes results from data harvesting. + In these cases only the first element is considered! + + :param text: Input text + :type text: str + """ try: if isinstance(text, list): text = text[0] @@ -17,12 +27,20 @@ def lemmaSpacy(text): [t.lemma_ for t in doc if not t.is_stop and len(t) > 3] ) return tokens.lower() - except: + except Exception: raise def htmlTags(text): - """Remove html tags in text.""" + """Reformat html tags in text using replacement list.. + + Some specific html formating leads to confusion with sentence and token + border detection. This method outputs the cleaned + text using a replacement list. + + :param text: Input text + :type text: str + """ if isinstance(text, list): text = text[0] for tagPair in [ diff --git a/src/semanticlayertools/clustering/infomap.py b/src/semanticlayertools/clustering/infomap.py index 4bbabd2..318cc0d 100644 --- a/src/semanticlayertools/clustering/infomap.py +++ b/src/semanticlayertools/clustering/infomap.py @@ -67,7 +67,7 @@ def calcInfomap(self, inFilePath): if self.recreate is False: raise OSError( f'Files at {cluFilePath} or {ftreeFilePath} exists. Set recreate = True to rewrite files.' 
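A short illustrative sketch for the two cleaning helpers documented above; it assumes a spaCy English model such as `en_core_web_sm` is installed, and the example sentence is made up.

~~~python
# Illustrative sketch (not from the patch series): the cleaning helpers above.
# Assumes a spaCy English model (e.g. en_core_web_sm) is installed.
from semanticlayertools.cleaning.text import htmlTags, lemmaSpacy

raw = ["Gravitational <i>waves</i> were detected by several observatories."]
text = htmlTags(raw)       # rewrites the HTML tags covered by the module's replacement list
print(lemmaSpacy(text))    # lowered lemmas, stop words and tokens of three letters or less dropped
~~~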
- ) + ) if self.recreate is True: os.remove(cluFilePath) os.remove(ftreeFilePath) diff --git a/src/semanticlayertools/clustering/leiden.py b/src/semanticlayertools/clustering/leiden.py index 2d2e561..bd39d9e 100644 --- a/src/semanticlayertools/clustering/leiden.py +++ b/src/semanticlayertools/clustering/leiden.py @@ -68,7 +68,7 @@ def __init__( for idx in tqdm(range(len(edgefiles)), leave=False): try: year = re.findall(r'\d{4}', edgefiles[idx])[0] - except: + except Exception: raise if timerange[0] <= int(year) <= timerange[1]: graph = ig.Graph.Read_Pajek(os.path.join(inpath, edgefiles[idx])) diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index 2cbcd53..d6ba223 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -102,7 +102,7 @@ def create_corpus(self, dataframe): try: # text pre-processing title = re.sub("\n", " ", title) - title = re.sub("[\r|\t|\x0c|\d+]", "", title) + title = re.sub("[\r|\t|\x0c|\d+]", "", title) # noqa: W605 title = re.sub("[.,]", "", title) title = re.sub("\\\'s", "'s", title) title = title.lower() @@ -112,7 +112,7 @@ def create_corpus(self, dataframe): tokens_without_sw = ' '.join([t.lemma_ for t in doc if not t.is_stop]) docs.append(tokens_without_sw) - except: + except Exception: print(title) raise @@ -269,7 +269,7 @@ def gatherClusterMetadata(self): for x in filenames: try: year = int(re.findall(r'\d{4}', x)[0]) - except: + except Exception: raise if self.timerange[0] <= year <= self.timerange[1]: yearFiles.append(x) diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index 9d30fb6..e22bed2 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -110,7 +110,7 @@ def calculateCoCitation(self, filepath): tempG = ig.Graph.TupleList(sortCoCitCounts, weights=True, vertex_name_attr='id') components = tempG.components() sortedComponents = sorted( - [(x, len(x), len(x)*100/len(tempG.vs)) for x in components], key=lambda x: x[1], reverse=True + [(x, len(x), len(x) * 100 / len(tempG.vs)) for x in components], key=lambda x: x[1], reverse=True ) with open(os.path.join(self.outpath, infilename + '_graphMetadata.txt'), 'w') as outfile: outfile.write(f'Graph derived from {filepath}\nSummary:\n') @@ -125,7 +125,7 @@ def calculateCoCitation(self, filepath): elem[1], elem[2], len(gcompTemp.es), - len(gcompTemp.es)*100/len(tempG.es) + len(gcompTemp.es) * 100 / len(tempG.es) ) giantComponent = sortedComponents[0] giantComponentGraph = tempG.vs.select(giantComponent[0]).subgraph() @@ -135,7 +135,7 @@ def calculateCoCitation(self, filepath): with open(os.path.join(self.outpath, infilename + '.ncol'), 'w') as outfile: for edge in sortCoCitCounts: outfile.write(f"{edge[0]} {edge[1]} {edge[2]}\n") - except: + except Exception: raise if self.debug == "l2": print(f'\tDone in {time.time() - starttime} seconds.') @@ -153,7 +153,7 @@ def processFolder(self): for file in tqdm(os.listdir(self.inpath), leave=False): try: year = re.findall(r'\d{4}', file)[0] - except: + except Exception: raise if self.timerange[0] <= int(year) <= self.timerange[1]: try: @@ -163,7 +163,7 @@ def processFolder(self): gcmetafile.write( f'{year},{outtuple[0]},{outtuple[1]},{outtuple[2]},{outtuple[3]}\n' ) - except: + except Exception: raise if self.debug is True: print(f'\tDone in {time.time() - starttime} seconds.') diff --git a/src/semanticlayertools/linkage/wordscore.py 
b/src/semanticlayertools/linkage/wordscore.py index e36721b..36552c7 100644 --- a/src/semanticlayertools/linkage/wordscore.py +++ b/src/semanticlayertools/linkage/wordscore.py @@ -93,7 +93,7 @@ def getScore(self, target): lvalue = len(set(x for x in contains if x[1] == subgram)) valueList.append((lvalue + 1) * (rvalue + 1)) return { - target: 1/self.counts[target] * (np.prod(valueList)) ** (1 / (2.0 * len(target))) + target: 1 / self.counts[target] * (np.prod(valueList)) ** (1 / (2.0 * len(target))) } def _calcBatch(self, batch): @@ -109,13 +109,13 @@ def run(self, write=False, outpath='./', recreate=False, limitCPUs=True): if self.debug is True: print(f'Found {len(self.uniqueNGrams)} unique {self.ngramEnd}-grams.') if limitCPUs is True: - ncores = int(cpu_count()*1/4) + ncores = int(cpu_count() * 1 / 4) else: ncores = cpu_count() - 2 pool = Pool(ncores) - chunk_size = int(len(self.uniqueNGrams)/ncores) + chunk_size = int(len(self.uniqueNGrams) / ncores) batches = [ - list(self.uniqueNGrams)[i:i+chunk_size] for i in range(0, len(self.uniqueNGrams), chunk_size) + list(self.uniqueNGrams)[i:i + chunk_size] for i in range(0, len(self.uniqueNGrams), chunk_size) ] ncoresResults = pool.map(self._calcBatch, batches) results = [x for y in ncoresResults for x in y] @@ -133,7 +133,7 @@ def run(self, write=False, outpath='./', recreate=False, limitCPUs=True): if recreate is False: raise IOError( f'File at {filePath} exists. Set recreate = True to rewrite file.' - ) + ) if recreate is True: os.remove(filePath) with open(filePath, 'a') as yearfile: @@ -241,7 +241,7 @@ def writeLinks(self, sl, scorePath, scoreLimit, outpath='./', recreate=False): if recreate is False: raise IOError( f'File at {filePath} exists. Set recreate = True to rewrite file.' - ) + ) if recreate is True: os.remove(filePath) @@ -274,21 +274,23 @@ def writeLinks(self, sl, scorePath, scoreLimit, outpath='./', recreate=False): if len(authors) >= 2: # pairs = [x for x in combinations(authors, 2)] for pair in combinations(authors, 2): # pairs: - file.write('{0} {1} {2} {3} 1\n'.format( - 1, - self.nodeMap[pair[0]], - 1, - self.nodeMap[pair[1]] + file.write( + '{0} {1} {2} {3} 1\n'.format( + 1, + self.nodeMap[pair[0]], + 1, + self.nodeMap[pair[1]] ) ) for author in authors: try: authNr = self.nodeMap[author] - file.write('{0} {1} {2} {3} 1\n'.format( - 1, - authNr, - 2, - paperNr + file.write( + '{0} {1} {2} {3} 1\n'.format( + 1, + authNr, + 2, + paperNr ) ) except KeyError: @@ -297,12 +299,13 @@ def writeLinks(self, sl, scorePath, scoreLimit, outpath='./', recreate=False): try: ngramNr = self.nodeMap[ngramrow[1]] weight = ngramrow[2] - file.write('{0} {1} {2} {3} {4}\n'.format( - 2, - paperNr, - 3, - ngramNr, - weight + file.write( + '{0} {1} {2} {3} {4}\n'.format( + 2, + paperNr, + 3, + ngramNr, + weight ) ) except KeyError: diff --git a/src/semanticlayertools/pipelines/cocitetimeclusters.py b/src/semanticlayertools/pipelines/cocitetimeclusters.py index 1762507..ea64e64 100644 --- a/src/semanticlayertools/pipelines/cocitetimeclusters.py +++ b/src/semanticlayertools/pipelines/cocitetimeclusters.py @@ -1,4 +1,4 @@ -"""Runs all steps to create reports for cocite temporal network clustering.""" + import time import os import multiprocessing @@ -23,6 +23,41 @@ def run( numberproc: int = num_processes, limitRefLength=False, debug=False ): + """Runs all steps of the temporal clustering pipepline. + + Creates cocitation networks, finds temporal clusters, writes report files + for large clusters. + + Default time range is 1945 to 2005. 
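The word-scoring `run()` method shown in the hunk above parallelizes the scoring by chunking the unique ngrams over a multiprocessing pool; the following standalone sketch restates that batching pattern with a dummy scoring function (`score_batch` is a stand-in, not part of the package).

~~~python
# Illustrative sketch (not from the patch series): the batching pattern used by the
# word-scoring run() method above. score_batch is a dummy stand-in for _calcBatch.
from multiprocessing import Pool, cpu_count

def score_batch(batch):
    return [(ngram, len(ngram)) for ngram in batch]

if __name__ == "__main__":
    unique_ngrams = [("semantic",), ("semantic", "layer"), ("layer", "tools")]
    ncores = max(1, cpu_count() // 4)   # the patched run() uses a quarter of the cores if limitCPUs is True
    chunk_size = max(1, len(unique_ngrams) // ncores)
    batches = [unique_ngrams[i:i + chunk_size]
               for i in range(0, len(unique_ngrams), chunk_size)]
    with Pool(ncores) as pool:
        per_batch = pool.map(score_batch, batches)
    scores = [item for batch in per_batch for item in batch]
    print(scores)
~~~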
Minimal size for considered clusters is + 1000 nodes. Lists of references are assumed to be contained in column + "reference". + + By default this routine takes all available cpu cores. Limit this + to a lower value to allow parallel performance of other tasks. + + :param inputFilepath: Path to corpora input data + :type text: str + :param cociteOutpath: Output path for cocitation networks + :type text: str + :param timeclusterOutpath: Output path for time clusters + :type text: str + :param reportsOutpath: Output path for reports + :type text: str + :param resolution: Main parameter for the clustering quality function (Constant Pots Model) + :type resolution: float + :param intersliceCoupling: Coupling parameter between two year slices, also influences cluster detection + :type intersliceCoupling: float + :param minClusterSize: The minimal cluster size, above which clusters are considered (default=1000) + :type minClusterSize: int + :param timerange: Time range to evalute clusters for (usefull for limiting computation time, default = (1945, 2005)) + :type timerange: tuple + :param referenceColumnName: Column name containing the references of a publication + :type referenceColumnName: str + :param numberProc: Number of CPUs the package is allowed to use (default=all) + :type numberProc: int + :param limitRefLength: Either False or integer giving the maximum number of references a considered publication is allowed to contain + :type limitRefLength: bool or int + """ for path in [cociteOutpath, timeclusterOutpath, reportsOutpath]: os.makedirs(path) starttime = time.time() diff --git a/src/semanticlayertools/pipelines/wordscorenet.py b/src/semanticlayertools/pipelines/wordscorenet.py index 5996fda..4dd5669 100644 --- a/src/semanticlayertools/pipelines/wordscorenet.py +++ b/src/semanticlayertools/pipelines/wordscorenet.py @@ -1,4 +1,3 @@ -"""Runs all steps to create a multilayer network.""" import tempfile from datetime import datetime import os @@ -19,7 +18,34 @@ def run( ngramsize=5, scoreLimit=1.0 ): - """Run all steps for multilayer network generation using wordscoring.""" + """Run all steps for multilayer network generation using wordscoring. + + Calculates word scoring for corpus documents, creates multilayer network + by linking co-authors, their publications and used ngrams and + calculates clusters for each timeslice using the infomap algorithm. + + By default, temmporal folders are used such that only the found clusters + are returned. + + For details of the ngram method refere to the module documentation. + + :param dataframe: The input corpus dataframe. + :type dataframe: class:`pandas.DataFrame` + :param tempFiles: Use temporal files during the pipeline run. 
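A hedged usage sketch for this pipeline entry point; keyword names are taken from the docstring above, and the corpus path and output folders are placeholders.

~~~python
# Illustrative sketch (not from the patch series): driving the cocitation,
# time-clustering and reporting pipeline. Keyword names follow the docstring
# above; the corpus path and output folders are placeholders.
from semanticlayertools.pipelines.cocitetimeclusters import run

run(
    inputFilepath="./corpus/",            # metadata files with a "reference" column
    cociteOutpath="./cocite/",
    timeclusterOutpath="./timeclusters/",
    reportsOutpath="./reports/",
    minClusterSize=1000,
    timerange=(1960, 1990),
    limitRefLength=False,                 # or an int capping the references per publication
)
~~~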
+ :type tempFiles: bool + :param outpath: Path for writing resulting cluster data + :type outpath: str + :param textColumn: Column name to use for ngram calculation + :type textColumn: str + :param pubIDColumn: Column name to use for publication identification (assumend to be unique) + :type pubIDColumn: str + :param yearColumn: Column name for temporal ordering publications, used during writing the scoring files + :type yearColumn: str + :param ngramsize: Maximum of considered ngrams (default: 5-gram) + :type ngramsize: int + :param scoreLimit: Minimal weight in the full corpus to consider an ngram score (default: 1.0) + :type scoreLimit: float + """ if tempFiles is True: basedir = tempfile.TemporaryDirectory().name diff --git a/tests/linkage/test_wordscore.py b/tests/linkage/test_wordscore.py index f40a60e..4eb8ce3 100644 --- a/tests/linkage/test_wordscore.py +++ b/tests/linkage/test_wordscore.py @@ -8,6 +8,7 @@ df = pd.read_json(filePath) + class TestCalculateScores(unittest.TestCase): def setUp(self): diff --git a/tox.ini b/tox.ini index 5047f72..1621a60 100644 --- a/tox.ini +++ b/tox.ini @@ -15,6 +15,12 @@ deps = commands_pre = python -m spacy download en_core_web_sm commands = pytest {posargs} +[testenv:flake8] +deps = + flake8 +commands = + flake8 --ignore=E501,E402,F401 src/semanticlayertools/ tests/ + [testenv:docs] description = invoke sphinx-build to build the HTML docs basepython = python3.9 From caee4324c89c1ced2bfaab021efb5b0cd88a5e7b Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 28 Feb 2022 14:40:01 +0100 Subject: [PATCH 32/53] add req corpus for readthedocs --- docs/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/requirements.txt b/docs/requirements.txt index c9c7e88..ff2bf99 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,3 +2,4 @@ sphinx sphinx_rtd_theme m2r2 semanticlayertools +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl From 3786649a87c163feb583412731ff911fced5dc3b Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 28 Feb 2022 14:46:15 +0100 Subject: [PATCH 33/53] bump version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index a18cfc9..dacad42 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = semanticlayertools -version = 0.0.4 +version = 0.0.5 author = Malte Vogl author_email = mvogl@mpiwg-berlin.mpg.de description = Create semantic layers using different methods for word linking. From 1702f8f6aa44453b3c12f91748c3ac6977c5f80b Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 28 Feb 2022 15:12:11 +0100 Subject: [PATCH 34/53] add docs req egg install --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index ff2bf99..1aeab91 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,5 @@ +-e . 
sphinx sphinx_rtd_theme m2r2 -semanticlayertools en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl From 92605e5799c92c2fa28ccf3fc7ca60a275240dde Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 2 Mar 2022 11:45:32 +0100 Subject: [PATCH 35/53] wip fix small data size vs cpu count --- src/semanticlayertools/linkage/cocitation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index e22bed2..4e69587 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -100,6 +100,8 @@ def calculateCoCitation(self, filepath): try: data = pd.read_json(filepath, lines=True).dropna(subset=[self.columnName]) chunk_size = int(data.shape[0] / self.numberProc) + if chunk_size == 0: # Deal with small data samples. + chunk_size = 1 chunks = np.array_split(data, chunk_size) pool = multiprocessing.Pool(processes=self.numberProc) cocitations = pool.map(self.getCombinations, chunks) From 016d02e634fa6685cefa5fc45ee8fdfc525d41e9 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 2 Mar 2022 11:53:27 +0100 Subject: [PATCH 36/53] upd version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index dacad42..281cf44 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = semanticlayertools -version = 0.0.5 +version = 0.1.1 author = Malte Vogl author_email = mvogl@mpiwg-berlin.mpg.de description = Create semantic layers using different methods for word linking. From 2c1eea51d6d745d712f07b097508fb91fe92b964 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 2 Mar 2022 14:53:04 +0100 Subject: [PATCH 37/53] wip fix nodeid -> self.publicationIDcolumn, check for module import error --- src/semanticlayertools/clustering/reports.py | 2 +- src/semanticlayertools/visual/utils.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index d6ba223..fe871fd 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -190,7 +190,7 @@ def fullReport(self, cluster): dfCluster = pd.concat(clusterdf, ignore_index=True) basedf = self.clusternodes.query('cluster == @cluster') inputnodes = set(basedf.node.values) - notFound = inputnodes.difference(set(dfCluster.nodeID.values)) + notFound = inputnodes.difference(set(dfCluster[self.publicationIDcolumn].values)) topAuthors = Counter( [x for y in dfCluster[self.authorColumnName].fillna('').values for x in y] ).most_common(20) diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index 85febac..6985c55 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -6,10 +6,14 @@ import numpy as np from scipy import stats -from sentence_transformers import SentenceTransformer -import umap -import hdbscan -import torch +try: + from sentence_transformers import SentenceTransformer + import umap + import hdbscan + import torch +except ModuleNotFoundError as e: + print('Please install the dependencies for the visualization routines, using `pip install semanticlayertools[embeddml]`.') + raise e smoothing = TypeVar('smoothing', bool, float) From a874d14725295f85562e146f3d666c0cde35465c Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 2 Mar 2022 15:08:30 
+0100 Subject: [PATCH 38/53] fix: text column contains text in list form --- src/semanticlayertools/clustering/reports.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index fe871fd..531cd35 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -97,7 +97,7 @@ def create_corpus(self, dataframe): nlp = spacy.load(mainLanguageCorp) docs = [] - titles = [x[0] for x in dataframe[self.textcolumn].values if type(x) == list] + titles = dataframe[self.textcolumn].values for title in tqdm(titles, leave=False): try: # text pre-processing From fb9b79ebe9c6455eb96bad111bfb63b60a7e5687 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 2 Mar 2022 15:20:07 +0100 Subject: [PATCH 39/53] fix: author and aff are joined by semicolon --- src/semanticlayertools/clustering/reports.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index 531cd35..7fe6752 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -192,17 +192,19 @@ def fullReport(self, cluster): inputnodes = set(basedf.node.values) notFound = inputnodes.difference(set(dfCluster[self.publicationIDcolumn].values)) topAuthors = Counter( - [x for y in dfCluster[self.authorColumnName].fillna('').values for x in y] - ).most_common(20) + [x for y in [x.split(';') for x in dfCluster[self.authorColumnName].fillna('').values] for x in y] + ).most_common(21) authortext = '' for x in topAuthors: - authortext += f'\t{x[0]}: {x[1]}\n' + if x[0] != '': + authortext += f'\t{x[0]}: {x[1]}\n' topAffils = Counter( - [x for y in dfCluster[self.affiliationColumnName].fillna('').values for x in y] + [x for y in [x.split(';') for x in dfCluster[self.affiliationColumnName].fillna('').values] for x in y] ).most_common(21) affiltext = '' - for x in topAffils[1:]: - affiltext += f'\t{x[0]}: {x[1]}\n' + for x in topAffils: + if x[0] != '': + affiltext += f'\t{x[0]}: {x[1]}\n' print(f'\tFinished base report for cluster {cluster}.') corpus = self.create_corpus(dfCluster) warnings.simplefilter(action='ignore', category=FutureWarning) @@ -281,5 +283,5 @@ def writeReports(self): """Generate reports and write to output path.""" for cluster in tqdm(self.largeClusterList, leave=False): outtext = self.fullReport(cluster) - with open(f'{self.outpath}Cluster_{cluster}.txt', 'w') as file: + with open(f'{self.outpath}/Cluster_{cluster}.txt', 'w') as file: file.write(outtext) From 32d095b592fa7555fd0dcbdba96d41a01f212ffa Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 3 Mar 2022 14:40:51 +0100 Subject: [PATCH 40/53] add option to cluster full graphs --- docs/visual.rst | 1 + src/semanticlayertools/clustering/leiden.py | 14 +++++++++++--- src/semanticlayertools/clustering/reports.py | 3 ++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/visual.rst b/docs/visual.rst index 3a1b6f3..0513ae3 100644 --- a/docs/visual.rst +++ b/docs/visual.rst @@ -36,6 +36,7 @@ the SentenceTransformer approach of SBERT and UMAP. Time consuming method! .. 
seealso :: `SBERT docs `_ + `UMAP docs `_ diff --git a/src/semanticlayertools/clustering/leiden.py b/src/semanticlayertools/clustering/leiden.py index bd39d9e..7b70ec8 100644 --- a/src/semanticlayertools/clustering/leiden.py +++ b/src/semanticlayertools/clustering/leiden.py @@ -44,8 +44,10 @@ class TimeCluster(): def __init__( self, inpath: str, outpath: str, - resolution: float = 0.003, intersliceCoupling: float = 0.4, + resolution: float = 0.003, + intersliceCoupling: float = 0.4, timerange: tuple = (1945, 2005), + useGC: bool = True, ): starttime = time.time() self.inpath = inpath @@ -61,7 +63,10 @@ def __init__( if os.path.isfile(self.outfile): raise OSError(f'Output file at {self.outfile} exists. Aborting.') - edgefiles = [x for x in os.listdir(inpath) if x.endswith('_GC.net')] + if useGC is True: + edgefiles = [x for x in os.listdir(inpath) if x.endswith('_GC.net')] + elif useGC is False: + edgefiles = [x for x in os.listdir(inpath) if x.endswith('.ncol')] self.graphDict = {} @@ -71,7 +76,10 @@ def __init__( except Exception: raise if timerange[0] <= int(year) <= timerange[1]: - graph = ig.Graph.Read_Pajek(os.path.join(inpath, edgefiles[idx])) + if useGC is True: + graph = ig.Graph.Read_Pajek(os.path.join(inpath, edgefiles[idx])) + elif useGC is False: + graph = ig.Graph.Read_Ncol(os.path.join(inpath, edgefiles[idx])) self.graphDict[year] = graph self.optimiser = la.Optimiser() diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index 7fe6752..30be990 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -52,7 +52,8 @@ def __init__( authorColumnName: str = 'author', affiliationColumnName: str = 'aff', publicationIDcolumn: str = 'nodeID', - numberProc: int = num_processes, minClusterSize: int = 1000, + numberProc: int = num_processes, + minClusterSize: int = 1000, timerange: tuple = (1945, 2005) ): """Constructor method""" From 788f07f072c27678e2bab2223623ec35bb5eced5 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 3 Mar 2022 18:06:08 +0100 Subject: [PATCH 41/53] include generateTree add authors, tox dep include embeddml --- AUTHORS.rst | 14 + docs/authors.rst | 1 + docs/index.rst | 1 + docs/requirements.txt | 2 +- docs/visual.rst | 16 +- setup.cfg | 3 +- .../visual/generateCitationTree.py | 705 ++++++++++++++++++ tox.ini | 3 +- 8 files changed, 741 insertions(+), 4 deletions(-) create mode 100644 AUTHORS.rst create mode 100644 docs/authors.rst create mode 100644 src/semanticlayertools/visual/generateCitationTree.py diff --git a/AUTHORS.rst b/AUTHORS.rst new file mode 100644 index 0000000..184b5ec --- /dev/null +++ b/AUTHORS.rst @@ -0,0 +1,14 @@ +======= +Credits +======= + +Development Lead +---------------- + +* Malte Vogl + +Contributors +------------ + +* Ira Kokoshko +* Robert Egel diff --git a/docs/authors.rst b/docs/authors.rst new file mode 100644 index 0000000..e122f91 --- /dev/null +++ b/docs/authors.rst @@ -0,0 +1 @@ +.. include:: ../AUTHORS.rst diff --git a/docs/index.rst b/docs/index.rst index 64da186..22a566c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -26,6 +26,7 @@ funded by the Federal Ministry of Education and Research, Germany (Grant No. 01 linkage clustering visual + authors license diff --git a/docs/requirements.txt b/docs/requirements.txt index 1aeab91..e5f708f 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ --e . 
+-e .[embeddml] sphinx sphinx_rtd_theme m2r2 diff --git a/docs/visual.rst b/docs/visual.rst index 0513ae3..09e7e89 100644 --- a/docs/visual.rst +++ b/docs/visual.rst @@ -36,7 +36,7 @@ the SentenceTransformer approach of SBERT and UMAP. Time consuming method! .. seealso :: `SBERT docs `_ - + `UMAP docs `_ @@ -57,3 +57,17 @@ HDBSCAN clustering. Reuses previously generated embedding of corpus. .. seealso :: `HDBSCAN docs `_ + + +Generate citation and reference tree graph +****************************************** + +Using the Dimensions AI dataset, this routine generates a structure +starting from a source publications, that represents its references and their +references as well as its citations and their citations. With this means, +visualizations of it show academic roots and conduits and can display +disciplinary pathways. + +.. automodule:: semanticlayertools.visual.generateCitationTree + :members: + :undoc-members: diff --git a/setup.cfg b/setup.cfg index 281cf44..93a474f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = semanticlayertools -version = 0.1.1 +version = 0.1.3 author = Malte Vogl author_email = mvogl@mpiwg-berlin.mpg.de description = Create semantic layers using different methods for word linking. @@ -42,6 +42,7 @@ dev = tox sphinx embeddml = + dimcli torch umap-learn hdbscan diff --git a/src/semanticlayertools/visual/generateCitationTree.py b/src/semanticlayertools/visual/generateCitationTree.py new file mode 100644 index 0000000..68ef71a --- /dev/null +++ b/src/semanticlayertools/visual/generateCitationTree.py @@ -0,0 +1,705 @@ +""" +generateCitationTree contains classes for generating citation trees for a single DOI. + +https://app.dimensions.ai is used in favor of CrossRef, as it contains richer information. +Requires dimensions.ai API access. +Results are compatible to the existing GenerateCitationNetwork module. +""" +import os +import dimcli +import pandas as pd +import numpy as np +import json +from tqdm import tqdm +import math +from typing import Dict, Tuple, List +import datetime +import time +import re +from requests.exceptions import HTTPError + +# type aliases +Doi = str +PubID = str +FilePath = str + + +class generate: + """GenerateCitationNet makes citation/reference networks for a document. + + For a given input document, its references and citations are evaluated. In + a second step, citations of citations and references of references are extracted. + This information is used to generate a tree like network. + """ + + def __init__( + self, + verbose: bool = False, + api_key="", + use_expanded_target_references: bool = False, + ): + """ + __init__ instantiates citation network generator. 
+ + :param verbose: forwarded to dimcli queries, defaults to False + :type verbose: bool, optional + :param api_key: dimensions.ai API key, tries to use dsl.ini if not existent, defaults to "" + :type api_key: str, optional + :param use_expanded_target_references: whether or not to use indirect connections + (not through input node) to make network edges + :type use_expanded_target_references: bool, optional + """ + while not dimcli.login_status(): + try: + dimcli.login(key=api_key) + except HTTPError as e: + if e.response.status_code == 401: + raise + time.sleep(5) + pass + + self.dsl: dimcli.Dsl = dimcli.Dsl() + self._verbose: bool = verbose + self.startDoi: Doi = "" + self.stringClean = {r"\s": "__", "/": "_slash_", ":": "_colon_", r"\.": "_dot_"} + self._make_hairball = use_expanded_target_references + + def fetchPubsByIDs( + self, pubIDs: List[PubID], authors: bool = True + ) -> Tuple[bool, pd.DataFrame]: + """ + Fetch publications from dimcli using PubIDs defined by dimensions.ai. + + :param pubIDs: list of PubIDs (string type alias) + :type pubIDs: List[PubID] + :param authors: whether to fetch author information, defaults to True + :type authors: bool, optional + :return: status-bool (True if everything is okay), dataframe containing + required information for input publications + :rtype: Tuple[bool, pd.DataFrame] + """ + if self._verbose: + print(f"hi, this is fetchPubsByIDs() for pubs = {pubIDs}") + + if authors: + query = f""" + search publications + where id in {json.dumps(pubIDs)} + return publications[id+doi+title+category_for+year + +reference_ids+authors+journal_title_raw+times_cited] + limit {len(pubIDs)} + """ + else: + query = f""" + search publications + where id in {json.dumps(pubIDs)} + return publications[id+doi+title+category_for+year + +reference_ids+journal_title_raw+times_cited] + limit {len(pubIDs)} + """ + + dsl_data = self.dsl.query(query, verbose=self._verbose) + + df = dsl_data.as_dataframe() + + try: + df["target_refs"] = df["reference_ids"] + except (TypeError, KeyError): + return False, pd.DataFrame() + + # replace NaN with empty list + df["target_refs"] = df["target_refs"].apply( + lambda target_ref: [] if type(target_ref) == float else target_ref + ) + + if not authors: + df["authors"] = [np.nan] * len(df) + + return True, df + + def fetchPubsByDois( + self, dois: List[Doi], authors: bool = True + ) -> Tuple[bool, pd.DataFrame]: + """ + Fetch publications from dimcli using DOIs. 
+ + :param dois: list of DOIs (string type alias) + :type dois: List[Doi] + :param authors: whether to fetch author information, defaults to True + :type authors: bool, optional + :return: status-bool (True if everything is okay), dataframe containing + required information for input publications + :rtype: Tuple[bool, pd.DataFrame] + """ + if self._verbose: + print(f"hi, this is fetchOrigin() for doi = {dois}") + + if authors: + query = f""" + search publications + where doi in {json.dumps(dois)} + return publications[id+doi+title+category_for+year + +reference_ids+authors+journal_title_raw+times_cited] + """ + else: + query = f""" + search publications + where doi in {json.dumps(dois)} + return publications[id+doi+title+category_for+year + +reference_ids+journal_title_raw+times_cited] + """ + + dsl_data = self.dsl.query(query, verbose=self._verbose) + + df = dsl_data.as_dataframe() + try: + df["target_refs"] = df["reference_ids"] + except (TypeError, KeyError): + return False, pd.DataFrame() + + # replace NaN with empty list + df["target_refs"] = df["target_refs"].apply( + lambda target_ref: [] if type(target_ref) == float else target_ref + ) + + if not authors: + df["authors"] = [np.nan] * len(df) + + return True, df + + def fetchCitations( + self, pubIDs: List[PubID], authors: bool = False + ) -> Tuple[bool, pd.DataFrame]: + """ + Fetch citing publications for a list of publications using their PubIDs. + + :param pubIDs: list of PubIDs (string type alias) + :type pubIDs: List[PubID] + :param authors: whether to fetch author information, defaults to False + :type authors: bool, optional + :return: status-bool (True if everything is okay), dataframe containing + required information for citing publications + :rtype: Tuple[bool, pd.DataFrame] + """ + if self._verbose: + print(f"hi, this is fetchCitations() for pubs = {pubIDs}") + + dfs = [] + if math.ceil(len(pubIDs) / 512) > 1: + __range__ = tqdm(range(math.ceil(len(pubIDs) / 512))) + else: + __range__ = range(math.ceil(len(pubIDs) / 512)) + + for i in __range__: + # dimcli queries are limited to 512 entites per list for `in` filtering + offset = i * 512 + + if authors: + query = f""" + search publications + where reference_ids in {json.dumps(pubIDs[offset:offset+512])} + return publications[id+doi+title+category_for+year + +reference_ids+authors+journal_title_raw+times_cited] + """ + else: + query = f""" + search publications + where reference_ids in {json.dumps(pubIDs[offset:offset+512])} + return publications[id+doi+title+category_for+year + +reference_ids+journal_title_raw+times_cited] + """ + + dsl_data = self.dsl.query_iterative(query, verbose=self._verbose) + tmp = dsl_data.as_dataframe() + + try: + _ = tmp["reference_ids"] + except (TypeError, KeyError): + return False, pd.DataFrame() + + dfs.append(tmp) + + df = pd.concat(dfs) + # intersection of input pubIDs and references of each publication + df["target_refs"] = df["reference_ids"].apply( + lambda row_refs: list(set(pubIDs) & set(row_refs)) + ) + + if not authors: + df["authors"] = [np.nan] * len(df) + + return True, df + + def fetchReferences( + self, pubIDs: List[PubID], authors: bool = True + ) -> Tuple[bool, pd.DataFrame]: + """ + Fetch references for a list of publications using their PubIDs as defined by dimensions.ai. 
+ + :param pubIDs: list of PubIDs (string type alias) + :type pubIDs: List[PubID] + :param authors: whether to fetch author information, defaults to True + :type authors: bool, optional + :return: status-bool (True if everything is okay), dataframe containing + required information for references + :rtype: Tuple[bool, pd.DataFrame] + """ + if self._verbose: + print(f"hi, this is fetchReferences() for pubs = {pubIDs}") + + dfs = [] + + if math.ceil(len(pubIDs) / 512) > 1: + __range__ = tqdm(range(math.ceil(len(pubIDs) / 512))) + else: + __range__ = range(math.ceil(len(pubIDs) / 512)) + + # get references (PubID) of given PubIDs + for i in __range__: + # dimcli queries are limited to 512 entites per list for `in` filtering + offset = i * 512 + + query = f""" + search publications + where id in {json.dumps(pubIDs[offset:offset + 512])} + return publications[id+reference_ids] + limit 512 + """ + + dsl_data = self.dsl.query(query, verbose=self._verbose) + tmp = dsl_data.as_dataframe() + + try: + _ = tmp["reference_ids"] + except (TypeError, KeyError): + return False, pd.DataFrame() + + dfs.append(tmp) + + df0 = pd.concat(dfs) + + # flatten list of references + # List[List[PubID]] -> List[PubID] + refs = [x for x in df0["reference_ids"].dropna().to_list() for x in x] + + # drop duplicates + refs = list(set(refs)) + + dfs = [] + if math.ceil(len(refs) / 512) > 1: + __range__ = tqdm(range(math.ceil(len(refs) / 512))) + else: + __range__ = range(math.ceil(len(refs) / 512)) + + for i in __range__: + # dimcli queries are limited to 512 entites per list for `in` filtering + offset = i * 512 + ok, df = self.fetchPubsByIDs(refs[offset : offset + 512], authors=authors) + if ok: + dfs.append(df) + else: # pragma: no cover + # cannot be reached unless dimensions database is malicious + return False, pd.DataFrame() + + return True, pd.concat(dfs) + + def run( + self, doi: Doi, levels_ref: int = 2, levels_cite: int = 2, authors: bool = False + ) -> Tuple[bool, pd.DataFrame]: + """ + Generate citation network for a publication using its DOI. 
+ + :param doi: input DOI (string type alias) + :type doi: Doi + :param levels_ref: number of levels for references, defaults to 2 + :type levels_ref: int, optional + :param levels_cite: number of levels for citing publications, defaults to 2 + :type levels_cite: int, optional + :param authors: whether to include author information, defaults to False + :type authors: bool, optional + :return: status-bool (True if everything is okay), dataframe containing + required information for input publications, references and citing publications + :rtype: Tuple[bool, pd.DataFrame] + """ + if hasattr(self, "result_df"): + return True, self.result_df + + self.startDoi = doi + + dfs = [] + + print("level 0") + ok, df_origin = self.fetchPubsByDois([doi], authors) + if not ok: + print(f"could not fetch publication for DOI {doi}") + return False, pd.DataFrame() + + df_origin["level"] = 0 + dfs.append(df_origin) + + ok, dfs_cite = self._fetchCite(df_origin, levels_cite, authors) + ok, dfs_ref = self._fetchRef(df_origin, levels_ref, authors) + + dfs.extend(dfs_cite + dfs_ref) + + self.result_df: pd.DataFrame = pd.concat(dfs).reset_index(drop=True) + + # cleaning + # self.result_df = self.dropDuplicates(self.result_df) + self.result_df["first_author"] = self.result_df["authors"].apply( + lambda authors: authors[0]["last_name"] if type(authors) == list else "" + ) + self.result_df["ref_count"] = self.result_df["reference_ids"].apply( + lambda refs: len(refs) if type(refs) == list else None + ) + self.result_df.index = self.result_df["id"] + + self.main_node = df_origin.iloc[0].copy() + if type(self.main_node["authors"]) == list: + self.main_node["first_author"] = self.main_node["authors"][0]["last_name"] + else: + self.main_node["first_author"] = "" + + self.result_df["main_category_for"] = self.result_df["category_for"].apply( + lambda c: [ + x["name"] + for x in filter(lambda dict_: re.match(r"^\d\d\s", dict_["name"]), c) + ][0] + if type(c) == list + else "" + ) + + # replace NaN in reference_ids with empty list + self.result_df["reference_ids"] = self.result_df["reference_ids"].apply( + lambda target_ref: [] if type(target_ref) == float else target_ref + ) + + # include expanded target refs + # (intersection of references of a paper and all listed publications, + # e.g. 
main_node cites A, Y cites main_node and A + # -> target_refs does not contain connection from Y to A) + all_pubs = set(self.result_df.index) + self.result_df["expanded_target_refs"] = self.result_df["reference_ids"].apply( + lambda reference_ids: list(all_pubs.intersection(reference_ids)) + ) + + return True, self.result_df + + def _fetchCite( + self, df_origin: pd.DataFrame, levels: int, authors: bool + ) -> Tuple[bool, List[pd.DataFrame]]: + dfs_cite = [] + + pubIDs = df_origin["id"].to_list() + for i in range(levels): + print(f"level {i + 1}, fetching citations for {len(pubIDs)} publications") + ok, tmp = self.fetchCitations(pubIDs, authors) + if ok: + pubIDs = tmp["id"].to_list() + tmp["level"] = i + 1 + dfs_cite.append(tmp) + else: # pragma: no cover + # cannot be reached unless dimensions database is malicious + return False, pd.DataFrame() + return True, dfs_cite + + def _fetchRef( + self, df_origin: pd.DataFrame, levels: int, authors: bool + ) -> Tuple[bool, List[pd.DataFrame]]: + dfs_ref = [] + pubIDs = df_origin["id"].to_list() + for i in range(levels): + print( + f"level {(i + 1) * (-1)}, fetching references for {len(pubIDs)} publications" + ) + ok, tmp = self.fetchReferences(pubIDs, authors) + if ok: + pubIDs = tmp["id"].to_list() + tmp["level"] = (i + 1) * (-1) + dfs_ref.append(tmp) + else: # pragma: no cover + # cannot be reached unless dimensions database is malicious + return False, pd.DataFrame() + return True, dfs_ref + + def _makeCompatibleRefDf( + self, df: pd.DataFrame, use_expanded: bool = False + ) -> pd.DataFrame: + """ + Reformat references dataframe to match prior versions formatting. + + :param df: dataframe as generated by .fetchReferences() + :type df: pd.DataFrame + :return: compatible dataframe + :rtype: pd.DataFrame + """ + levels_ref = min(df["level"]) + + # flatten references + if use_expanded: + target_ref_type = "expanded_target_refs" + else: + target_ref_type = "target_refs" + + ref_tuples = { + (row["id"], ref) + for _, row in df.query(f"{levels_ref} < level <= 0").iterrows() + for ref in row[target_ref_type] + } + + df = df[~df.index.duplicated(keep="first")] + + refs = [] + for (source_id, target_id) in ref_tuples: + source = df.loc[source_id] + target = df.loc[target_id] + + refs.append( + { + "type": "reference", + "sourceYear": source["year"], + "sourceDOI": source["doi"], + "sourcePubID": source["id"], + "sourceJournal": source["journal_title_raw"], + "targetFull": "", + "targetYear": target["year"], + "targetDOI": target["doi"], + "targetPubID": target["id"], + "targetrefCount": target["ref_count"], + "targetis_ref_byCount": target["times_cited"], + "targettitleStr": target["title"], + "targetFirstAuthor": target["first_author"], + "targetJournal": target["journal_title_raw"], + "targetSubject": target["category_for"], + } + ) + + return pd.DataFrame(refs) + + def _makeCompatibleCiteDf( + self, df: pd.DataFrame, use_expanded: bool = False + ) -> pd.DataFrame: + """ + Reformat citation dataframe to match prior versions formatting. 
+ + :param df: dataframe as generated by .fetchCitations() + :type df: pd.DataFrame + :return: compatible dataframe + :rtype: pd.DataFrame + """ + levels_cite = max(df["level"]) + + # flatten citations + if use_expanded: + target_ref_type = "expanded_target_refs" + else: + target_ref_type = "target_refs" + + cite_tuples = { + (row["id"], ref) + for _, row in df.query(f"{levels_cite} >= level > 0").iterrows() + for ref in row[target_ref_type] + } + + df = df[~df.index.duplicated(keep="first")] + + cites = [] + for (source_id, target_id) in cite_tuples: + source = df.loc[source_id] + target = df.loc[target_id] + + cites.append( + { + "type": "citation", + "targetPubID": target["id"], + "targetYear": target["year"], + "targetDOI": target["doi"], + "targetJournal": target["journal_title_raw"], + "sourceYear": source["year"], + "sourceDOI": source["doi"], + "sourcePubID": source["id"], + "sourcerefCount": source["ref_count"], + "sourceis_ref_byCount": source["times_cited"], + "sourcetitleStr": source["title"], + "sourceFirstAuthor": source["first_author"], + "sourceJournal": source["journal_title_raw"], + "sourceSubject": source["category_for"], + } + ) + + return pd.DataFrame(cites) + + def makeCompatibleDf(self) -> Tuple[bool, pd.DataFrame]: + """ + Reformat dataframe to match prior versions formatting. + + :param df: dataframe as generated by .run() + :type df: pd.DataFrame + :return: compatible dataframe + :rtype: pd.DataFrame + """ + if not hasattr(self, "result_df"): + print("you gotta run .run() first") + return False, pd.DataFrame() + + if hasattr(self, "compatible_result_df"): + return True, self.compatible_result_df + + df_ref = self._makeCompatibleRefDf(self.result_df, self._make_hairball) + df_cite = self._makeCompatibleCiteDf(self.result_df, self._make_hairball) + + self.compatible_result_df: pd.DataFrame = pd.concat( + [df_cite, df_ref], ignore_index=True + ).fillna("") + + return True, self.compatible_result_df + + def runCompatible( + self, + doi: Doi, + level: int = 2, + direct: str = "both", + debug: bool = False, + ) -> Tuple[bool, pd.DataFrame]: + """ + Wrap .run() with same parameters and outputs as prior versions. 
+ + :param doi: input DOI (string type alias) + :type doi: Doi + :param level: number of levels to fetch, defaults to 2 + :type level: int, optional + :param direct: direction of search (either "ref", "cite" or "both"), defaults to "both" + :type direct: str, optional + :param debug: [description], defaults to False + :type debug: bool, optional + :return: [description] + :rtype: Tuple[bool, pd.DataFrame] + """ + if direct == "ref": + ok, df = self.run(doi, levels_ref=level, levels_cite=0) + elif direct == "cite": + ok, df = self.run(doi, levels_ref=0, levels_cite=level) + elif direct == "both": + ok, df = self.run(doi, levels_ref=level, levels_cite=level) + else: + print("provide proper direction of search (either `ref`, `cite` or `both`)") + return False, pd.DataFrame() + + if not ok: + return False, pd.DataFrame() + + ok, comp_df = self.makeCompatibleDf() + if ok: + return True, comp_df + else: + return False, pd.DataFrame() + + def _nodeDict(self, row: pd.Series) -> Dict: + # row = row.fillna("") + + if row["doi"].lower() == self.startDoi.lower(): + inputDOI = "True" + else: + inputDOI = "False" + res = { + # "label": nodeName, + # "x": 0, + # "y": 0, + "id": row["id"], + "attributes": { + # "name": nodeName, + "title": row["title"], + "doi": row["doi"], + "nodeyear": row["year"], + "ref-by-count": row["times_cited"], + "is_input_DOI": inputDOI, + "category_for": row["main_category_for"], + "level": row["level"], + }, + # "color": "rgb(0,0,0)", + # "size": 10 + } + return res + + def _edgeDict(self, row: pd.Series) -> Dict: + # row = row.fillna("") + + res = { + "source": row["sourcePubID"], + "target": row["targetPubID"], + # "id": idx, + "attributes": {"year": row["sourceYear"], "type": row["type"]}, + # "color": "rgb(0,0,0)", + # "size": 1 + } + return res + + def _createFilename(self, ext: str = "json") -> FilePath: + filename = self.startDoi + date = datetime.datetime.now().strftime("%Y-%m-%d") + for key, val in self.stringClean.items(): + filename = re.sub(key, val, filename) + if self._make_hairball: + path = f"{self.main_node['first_author']}_{filename}_date_{date}_hairball.{ext}" + else: + path = f"{self.main_node['first_author']}_{filename}_date_{date}.{ext}" + return path + + def createJSON(self, outputPath: FilePath = "./out") -> Tuple[bool, FilePath]: + """ + Create JSON file on disk containing network as lists of nodes and edges for visualization. 
+ + :param outputPath: output directory, defaults to "./out" + :type outputPath: FilePath, optional + :return: status-bool (True if everything is okay), path of JSON file + :rtype: Tuple[bool, FilePath] + """ + if not hasattr(self, "result_df"): + print("You need to use .run() first to create some data to write.") + return False, "" + + if not hasattr(self, "compatible_result_df"): + self.makeCompatibleDf() + + allNodes = [ + x + for _, x in self.result_df[~self.result_df.index.duplicated()] + .fillna("") + .iterrows() + ] + allRows = [x for x in self.compatible_result_df.fillna("").iterrows()] + + outputPath = os.path.abspath(outputPath) + if not os.path.exists(outputPath): + os.mkdir(outputPath) + + with open(f"{outputPath}/{self._createFilename()}", "w") as outFile: + # write nodes + outFile.write('{\n "nodes": [\n') + + # write nodes from compatible_result_df/allNodes + while allNodes: + node = allNodes.pop() + if len(allNodes) == 0: + outFile.write(json.dumps(self._nodeDict(node)) + "\n") + else: + outFile.write(json.dumps(self._nodeDict(node)) + ",\n") + + # write edges + outFile.write(' ],\n "edges":[') + while allRows: + idx, edge = allRows.pop() + if len(allRows) == 0: + outFile.write(json.dumps(self._edgeDict(edge)) + "\n") + else: + x = self._edgeDict(edge) + outFile.write(json.dumps(x) + ",\n") + outFile.write(" ]\n}") + + return True, f"{outputPath}/{self._createFilename()}" + + def logout(self) -> None: + """ + Dimcli logout. + """ + dimcli.logout() diff --git a/tox.ini b/tox.ini index 1621a60..e7173e7 100644 --- a/tox.ini +++ b/tox.ini @@ -11,7 +11,7 @@ testpaths = [testenv] deps = pytest - -rrequirements.txt + -e ./[embeddml] commands_pre = python -m spacy download en_core_web_sm commands = pytest {posargs} @@ -25,6 +25,7 @@ commands = description = invoke sphinx-build to build the HTML docs basepython = python3.9 deps = + -e ./[embeddml] sphinx sphinx_rtd_theme m2r2 From 0d4371c1ec2f51436e5cd763ccab4d4d6f95958e Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 7 Mar 2022 16:05:08 +0100 Subject: [PATCH 42/53] wip updt org --- src/semanticlayertools/visual/generateCitationTree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/semanticlayertools/visual/generateCitationTree.py b/src/semanticlayertools/visual/generateCitationTree.py index 68ef71a..69c7b04 100644 --- a/src/semanticlayertools/visual/generateCitationTree.py +++ b/src/semanticlayertools/visual/generateCitationTree.py @@ -24,7 +24,7 @@ FilePath = str -class generate: +class generateTree: """GenerateCitationNet makes citation/reference networks for a document. For a given input document, its references and citations are evaluated. In @@ -310,7 +310,7 @@ def run( Generate citation network for a publication using its DOI. 
:param doi: input DOI (string type alias) - :type doi: Doi + :type doi: Drunoi :param levels_ref: number of levels for references, defaults to 2 :type levels_ref: int, optional :param levels_cite: number of levels for citing publications, defaults to 2 From 69ea04d226765c10f15bcb9dca4d18048105673e Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 15 Mar 2022 15:13:34 +0100 Subject: [PATCH 43/53] add citationet working data generation --- src/semanticlayertools/visual/citationnet.py | 235 +++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 src/semanticlayertools/visual/citationnet.py diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py new file mode 100644 index 0000000..6201657 --- /dev/null +++ b/src/semanticlayertools/visual/citationnet.py @@ -0,0 +1,235 @@ +import dimcli +import json +import numpy as np +import pandas as pd +from tqdm import tqdm +import time +from collections import Counter +from requests.exceptions import HTTPError + + +class GenerateTree: + """Generate tree for citationent visualization. + + For a given input document, its references and citations are evaluated. In + a second step, citations of citations and references of references are + extracted. This information is used to generate a tree like network for + visualization. + """ + + def __init__(self, verbose: bool = False, api_key=""): + """Init module.""" + while not dimcli.login_status(): + try: + dimcli.login(key=api_key) + except HTTPError as e: + if e.response.status_code == 401: + raise + time.sleep(5) + pass + + self.dsl: dimcli.Dsl = dimcli.Dsl() + self._verbose = verbose + self.startDoi: str = "" + self.citationLimit: int = 100 + self.dataframeList = [] + + self.stringClean = { + r"\s": "__", + "/": "_slash_", + ":": "_colon_", + r"\.": "_dot_" + } + + def _formatFOR(self, row): + """Format existing FOR codes. + + Each publication has a total value of one. Only first level parts of + codes are counted. If no FOR code exist, return '00:1'. + + Example: "02, 0201, 0204, 06" yields "02:0.75;06:025" + """ + try: + inputForcodes = [x['name'][:2] for x in row] + forcodes = ';'.join( + [f'{x[0]}:{x[1]/len(inputForcodes):.2f}' for x in Counter( + inputForcodes + ).most_common()] + ) + except TypeError: + forcodes = '00:1' + return forcodes + + def _editDF(self, inputdf, dftype='cite_l1', level2List=None): + """Return reformated dataframe. 
""" + retCols = ['source', 'target', 'doi', 'year', 'title', 'times_cited', 'forcodes', 'level', 'is_input'] + formatedFOR = inputdf.category_for.apply(lambda row: self._formatFOR(row)) + inputdf.insert(0, 'forcodes', formatedFOR) + inputdf.drop(['category_for'], axis=1, inplace=True) + inputdf.rename(columns={'id': 'source'}, inplace=True) + if dftype in ['ref_l1', 'cite_l2', 'ref_l2']: + outdf = inputdf.explode('reference_ids') + outdf.rename(columns={'reference_ids': 'target'}, inplace=True) + if dftype == 'cite_l2': + outdf = outdf.query('target.isin(@level2List)') + elif dftype == 'cite_l1': + inputdf.insert(0, 'target', self.pubids) + outdf = inputdf.copy() + outdf.insert(0, 'level', dftype) + outdf = outdf.dropna(subset=['source', 'target']) + outdf.insert( + 0, + 'is_input', + outdf.source.apply(lambda x: x == self.pubids) + ) + return outdf[retCols] + + def _getMissing(self, idlist): + """Get metadata for second level reference nodes.""" + retCols = ['source', 'doi', 'year', 'title', 'times_cited', 'forcodes', 'level', 'is_input'] + dfList = [] + if len(idlist) > 512: + for partlist in tqdm(np.array_split(idlist, round(len(idlist)/400))): + res = self.dsl.query_iterative( + f"""search + publications + where + id in {json.dumps(list(partlist))} + return + publications[id+doi+times_cited+category_for+title+year] + """, + verbose=self._verbose + ) + dfList.append(res.as_dataframe()) + retDF = pd.concat(dfList) + else: + res = self.dsl.query_iterative( + f"""search + publications + where + id in {json.dumps(list(idlist))} + return + publications[id+doi+times_cited+category_for+title+year] + """, + verbose=self._verbose + ) + retDF = res.as_dataframe() + formatedFOR = retDF.category_for.apply(lambda row: self._formatFOR(row)) + retDF.insert(0, 'forcodes', formatedFOR) + retDF.drop(['category_for'], axis=1, inplace=True) + retDF.rename(columns={'id': 'source'}, inplace=True) + retDF.insert(0, 'level', 'ref_l2') + retDF.insert(0, 'is_input', False) + return retDF[retCols] + + def query(self, startDoi=''): + self.startDoi = startDoi + starttime = time.time() + doi2id = self.dsl.query( + f"""search + publications + where + doi = "{startDoi}" and times_cited <= {self.citationLimit} + return + publications[id+doi+times_cited+category_for+title+year+reference_ids] + """, + verbose=self._verbose + ) + querydf = doi2id.as_dataframe() + if querydf.shape[0] > 0: + self.pubids = querydf['id'].values[0] + self.pubrefs = list( + [x for y in querydf['reference_ids'].values for x in y] + ) + self.dataframeList.append( + self._editDF(querydf, dftype="ref_l1") + ) + ref1trgtList = list(self.dataframeList[0].target.values) + cit1df = self.dsl.query_iterative( + f"""search + publications + where + reference_ids = "{self.pubids}" + return + publications[id+doi+times_cited+category_for+title+year+reference_ids] + """, + verbose=self._verbose) + self.dataframeList.append( + self._editDF(cit1df.as_dataframe(), dftype='cite_l1') + ) + cit1SrcList = list(self.dataframeList[1].source.values) + cit2df = self.dsl.query_iterative( + f"""search + publications + where + reference_ids in {json.dumps(cit1SrcList)} + return + publications[id+doi+times_cited+category_for+title+year+reference_ids]""", + verbose=self._verbose + ) + self.dataframeList.append( + self._editDF(cit2df.as_dataframe(), dftype='cite_l2', level2List=cit1SrcList) + ) + ref2df = self.dsl.query_iterative( + f"""search + publications + where + id in {json.dumps(ref1trgtList)} + return + 
publications[id+doi+times_cited+category_for+title+year+reference_ids]""", + verbose=self._verbose + ) + self.dataframeList.append( + self._editDF(ref2df.as_dataframe(), dftype='ref_l2') + ) + print(f'Finished queries in {time.time() - starttime} seconds.') + return self + else: + print('The requested DOI is cited to often.') + + def returnLinks(self): + return pd.concat(self.dataframeList) + + def generateNetworkFiles(self, outpath): + starttime = time.time() + outformat = {'nodes': [], 'edges': []} + dflinks = pd.concat(self.dataframeList) + srcNodes = dflinks.source.unique() + trgNodes = [x for x in dflinks.target.unique() if x not in srcNodes] + nodeMetadata = pd.concat( + [ + dflinks.drop('target', axis=1).drop_duplicates(), + self._getMissing(trgNodes) + ] + ) + for idx, row in nodeMetadata.iterrows(): + outformat['nodes'].append( + { + 'id': row['source'], + 'attributes': + { + "title": row["title"], + "doi": row["doi"], + "nodeyear": row["year"], + "ref-by-count": row["times_cited"], + "is_input_DOI": row['is_input'], + "category_for": row["forcodes"], + 'level': row['level'] + } + } + ) + for idx, row in dflinks.iterrows(): + outformat['edges'].append( + { + 'source': row['source'], + 'target': row['target'], + 'attributes': + { + 'year': row['year'], + 'level': row['level'] + } + } + ) + with open(outpath, 'w') as outfile: + json.dump(outformat, outfile, indent=4) + return f'Finished querying extra metadata in {time.time() - starttime} seconds.' From eaa2d0fdef4e7bc68b93d844ba9097ba8fbd6218 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 17 Mar 2022 18:00:43 +0100 Subject: [PATCH 44/53] wip add cleaning routine for title strings to make data json friendly --- src/semanticlayertools/visual/citationnet.py | 31 +++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index 6201657..602fd93 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -4,6 +4,8 @@ import pandas as pd from tqdm import tqdm import time +import re +import os from collections import Counter from requests.exceptions import HTTPError @@ -41,6 +43,19 @@ def __init__(self, verbose: bool = False, api_key=""): r"\.": "_dot_" } + def _cleanTitleString(self, row): + """Clean non-JSON characters from titles. + + Removes newline characters and double backslashes. + """ + try: + title = row + for pair in [('\n', ' '), (r':?\\+', '')]: + title = re.sub(pair[0], pair[1], title) + return title + except Exception: + return 'Can not process title.' + def _formatFOR(self, row): """Format existing FOR codes. 
@@ -82,6 +97,9 @@ def _editDF(self, inputdf, dftype='cite_l1', level2List=None): 'is_input', outdf.source.apply(lambda x: x == self.pubids) ) + cleantitle = outdf.title.apply(lambda row: self._cleanTitleString(row)) + outdf.drop('title', axis=1, inplace=True) + outdf.insert(0, 'title', cleantitle) return outdf[retCols] def _getMissing(self, idlist): @@ -190,7 +208,7 @@ def query(self, startDoi=''): def returnLinks(self): return pd.concat(self.dataframeList) - def generateNetworkFiles(self, outpath): + def generateNetworkFiles(self, outfolder): starttime = time.time() outformat = {'nodes': [], 'edges': []} dflinks = pd.concat(self.dataframeList) @@ -208,7 +226,7 @@ def generateNetworkFiles(self, outpath): 'id': row['source'], 'attributes': { - "title": row["title"], + "title": row['title'], "doi": row["doi"], "nodeyear": row["year"], "ref-by-count": row["times_cited"], @@ -230,6 +248,11 @@ def generateNetworkFiles(self, outpath): } } ) - with open(outpath, 'w') as outfile: - json.dump(outformat, outfile, indent=4) + doiname = self.startDoi + for key, val in self.stringClean.items(): + doiname = re.sub(key, val, doiname) + + outfile = os.path.join(outfolder, doiname + '.json') + with open(outfile, 'w', encoding="utf8") as ofile: + json.dump(outformat, ofile, ensure_ascii=False) return f'Finished querying extra metadata in {time.time() - starttime} seconds.' From b16514816d6b3ee21c41b2b1237b448012bd4115 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Fri, 18 Mar 2022 15:07:51 +0100 Subject: [PATCH 45/53] add cleaning of " --- src/semanticlayertools/visual/citationnet.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index 602fd93..f72745d 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -50,7 +50,7 @@ def _cleanTitleString(self, row): """ try: title = row - for pair in [('\n', ' '), (r':?\\+', '')]: + for pair in [('\n', ' '), (r':?\\+', ''), ('"', '')]: title = re.sub(pair[0], pair[1], title) return title except Exception: @@ -136,8 +136,11 @@ def _getMissing(self, idlist): retDF.insert(0, 'forcodes', formatedFOR) retDF.drop(['category_for'], axis=1, inplace=True) retDF.rename(columns={'id': 'source'}, inplace=True) - retDF.insert(0, 'level', 'ref_l2') + retDF.insert(0, 'level', 'ref_l3') retDF.insert(0, 'is_input', False) + cleantitle = retDF.title.apply(lambda row: self._cleanTitleString(row)) + retDF.drop('title', axis=1, inplace=True) + retDF.insert(0, 'title', cleantitle) return retDF[retCols] def query(self, startDoi=''): @@ -220,7 +223,7 @@ def generateNetworkFiles(self, outfolder): self._getMissing(trgNodes) ] ) - for idx, row in nodeMetadata.iterrows(): + for idx, row in nodeMetadata.fillna('').iterrows(): outformat['nodes'].append( { 'id': row['source'], @@ -236,7 +239,7 @@ def generateNetworkFiles(self, outfolder): } } ) - for idx, row in dflinks.iterrows(): + for idx, row in dflinks.fillna('').iterrows(): outformat['edges'].append( { 'source': row['source'], @@ -254,5 +257,5 @@ def generateNetworkFiles(self, outfolder): outfile = os.path.join(outfolder, doiname + '.json') with open(outfile, 'w', encoding="utf8") as ofile: - json.dump(outformat, ofile, ensure_ascii=False) + json.dump(outformat, ofile, indent=4, ensure_ascii=True) return f'Finished querying extra metadata in {time.time() - starttime} seconds.' 
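Taken together, patches 43 to 45 make the citationnet data generation usable end to end. A minimal sketch, assuming a valid dimensions.ai API key and using a placeholder DOI and output folder that would need to be replaced::

    import os

    from semanticlayertools.visual.citationnet import GenerateTree

    outfolder = "./citationnet_data"   # hypothetical output directory
    os.makedirs(outfolder, exist_ok=True)

    # The constructor logs in to dimensions.ai via dimcli, so a working key is required.
    tree = GenerateTree(api_key="YOUR_DIMENSIONS_API_KEY")

    # Collect references and citations around one publication (placeholder DOI).
    tree.query(startDoi="10.1234/example.doi")

    # Inspect the collected source/target links as a pandas DataFrame ...
    print(tree.returnLinks().head())

    # ... or write the node/edge JSON consumed by the citationnet visualization.
    print(tree.generateNetworkFiles(outfolder))

If the starting DOI is cited more often than the built-in citation limit, query() only reports this and collects no data, so the two calls after it are only meaningful after a successful query.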
From 0c0419ffca29835ea690b0c3f4cdb4b9bd4ee959 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 21 Mar 2022 13:37:07 +0100 Subject: [PATCH 46/53] add first author name to filename --- src/semanticlayertools/visual/citationnet.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index f72745d..69f3a9c 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -33,6 +33,7 @@ def __init__(self, verbose: bool = False, api_key=""): self.dsl: dimcli.Dsl = dimcli.Dsl() self._verbose = verbose self.startDoi: str = "" + self.firstAuthor: str = "NoAuthor" self.citationLimit: int = 100 self.dataframeList = [] @@ -145,6 +146,7 @@ def _getMissing(self, idlist): def query(self, startDoi=''): self.startDoi = startDoi + self.dataframeList = [] starttime = time.time() doi2id = self.dsl.query( f"""search @@ -152,11 +154,15 @@ def query(self, startDoi=''): where doi = "{startDoi}" and times_cited <= {self.citationLimit} return - publications[id+doi+times_cited+category_for+title+year+reference_ids] + publications[id+authors+doi+times_cited+category_for+title+year+reference_ids] """, verbose=self._verbose ) querydf = doi2id.as_dataframe() + try: + self.firstAuthor = doi2id.as_dataframe_authors().last_name.iloc[0] + except Exception: + raise if querydf.shape[0] > 0: self.pubids = querydf['id'].values[0] self.pubrefs = list( @@ -252,10 +258,12 @@ def generateNetworkFiles(self, outfolder): } ) doiname = self.startDoi + firstauthor = self.firstAuthor for key, val in self.stringClean.items(): doiname = re.sub(key, val, doiname) + firstauthor = re.sub(key, val, firstauthor) - outfile = os.path.join(outfolder, doiname + '.json') + outfile = os.path.join(outfolder, firstauthor + '_' + doiname + '.json') with open(outfile, 'w', encoding="utf8") as ofile: - json.dump(outformat, ofile, indent=4, ensure_ascii=True) + json.dump(outformat, ofile, ensure_ascii=True) return f'Finished querying extra metadata in {time.time() - starttime} seconds.' From 1df9645daf939089f3b010fcce3e5f79389a76cc Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 21 Mar 2022 16:09:40 +0100 Subject: [PATCH 47/53] wip add time,filename output --- src/semanticlayertools/visual/citationnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index 69f3a9c..73253a0 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -263,7 +263,7 @@ def generateNetworkFiles(self, outfolder): doiname = re.sub(key, val, doiname) firstauthor = re.sub(key, val, firstauthor) - outfile = os.path.join(outfolder, firstauthor + '_' + doiname + '.json') + outfile = os.path.join(outfolder, f'{firstauthor}_{doiname}.json') with open(outfile, 'w', encoding="utf8") as ofile: json.dump(outformat, ofile, ensure_ascii=True) - return f'Finished querying extra metadata in {time.time() - starttime} seconds.' 
+ return {time.time() - starttime}, f'{firstauthor}_{doiname}.json' From 8ddb6ac17fdea2030a3a208cecda5c1bb58b6107 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 22 Mar 2022 12:34:17 +0100 Subject: [PATCH 48/53] catch exception of no author or empty df --- src/semanticlayertools/visual/citationnet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index 73253a0..c5ff17c 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -160,9 +160,9 @@ def query(self, startDoi=''): ) querydf = doi2id.as_dataframe() try: - self.firstAuthor = doi2id.as_dataframe_authors().last_name.iloc[0] - except Exception: - raise + self.firstAuthor = doi2id.as_dataframe_authors()['last_name'].iloc[0] + except KeyError: + pass if querydf.shape[0] > 0: self.pubids = querydf['id'].values[0] self.pubrefs = list( @@ -212,7 +212,7 @@ def query(self, startDoi=''): print(f'Finished queries in {time.time() - starttime} seconds.') return self else: - print('The requested DOI is cited to often.') + print('The requested DOI is either cited to often or not available in the dataset.') def returnLinks(self): return pd.concat(self.dataframeList) From ab082024f1c3cdb24eac311658525f48f2a7229c Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 22 Mar 2022 12:36:54 +0100 Subject: [PATCH 49/53] chg return type --- src/semanticlayertools/visual/citationnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index c5ff17c..4ab4a1c 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -212,7 +212,7 @@ def query(self, startDoi=''): print(f'Finished queries in {time.time() - starttime} seconds.') return self else: - print('The requested DOI is either cited to often or not available in the dataset.') + return f'The requested DOI {startDoi} is either cited to often or not available in the dataset.' def returnLinks(self): return pd.concat(self.dataframeList) From 15d96666bf23914cd0caa3688c68e77811b77229 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 22 Mar 2022 13:14:27 +0100 Subject: [PATCH 50/53] add option for citationlimit --- src/semanticlayertools/visual/citationnet.py | 3 +- .../visual/generateCitationTree.py | 705 ------------------ 2 files changed, 2 insertions(+), 706 deletions(-) delete mode 100644 src/semanticlayertools/visual/generateCitationTree.py diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index 4ab4a1c..edf0aaf 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -144,8 +144,9 @@ def _getMissing(self, idlist): retDF.insert(0, 'title', cleantitle) return retDF[retCols] - def query(self, startDoi=''): + def query(self, startDoi='', citationLimit=100): self.startDoi = startDoi + self.citationLimit = citationLimit self.dataframeList = [] starttime = time.time() doi2id = self.dsl.query( diff --git a/src/semanticlayertools/visual/generateCitationTree.py b/src/semanticlayertools/visual/generateCitationTree.py deleted file mode 100644 index 69c7b04..0000000 --- a/src/semanticlayertools/visual/generateCitationTree.py +++ /dev/null @@ -1,705 +0,0 @@ -""" -generateCitationTree contains classes for generating citation trees for a single DOI. 
- -https://app.dimensions.ai is used in favor of CrossRef, as it contains richer information. -Requires dimensions.ai API access. -Results are compatible to the existing GenerateCitationNetwork module. -""" -import os -import dimcli -import pandas as pd -import numpy as np -import json -from tqdm import tqdm -import math -from typing import Dict, Tuple, List -import datetime -import time -import re -from requests.exceptions import HTTPError - -# type aliases -Doi = str -PubID = str -FilePath = str - - -class generateTree: - """GenerateCitationNet makes citation/reference networks for a document. - - For a given input document, its references and citations are evaluated. In - a second step, citations of citations and references of references are extracted. - This information is used to generate a tree like network. - """ - - def __init__( - self, - verbose: bool = False, - api_key="", - use_expanded_target_references: bool = False, - ): - """ - __init__ instantiates citation network generator. - - :param verbose: forwarded to dimcli queries, defaults to False - :type verbose: bool, optional - :param api_key: dimensions.ai API key, tries to use dsl.ini if not existent, defaults to "" - :type api_key: str, optional - :param use_expanded_target_references: whether or not to use indirect connections - (not through input node) to make network edges - :type use_expanded_target_references: bool, optional - """ - while not dimcli.login_status(): - try: - dimcli.login(key=api_key) - except HTTPError as e: - if e.response.status_code == 401: - raise - time.sleep(5) - pass - - self.dsl: dimcli.Dsl = dimcli.Dsl() - self._verbose: bool = verbose - self.startDoi: Doi = "" - self.stringClean = {r"\s": "__", "/": "_slash_", ":": "_colon_", r"\.": "_dot_"} - self._make_hairball = use_expanded_target_references - - def fetchPubsByIDs( - self, pubIDs: List[PubID], authors: bool = True - ) -> Tuple[bool, pd.DataFrame]: - """ - Fetch publications from dimcli using PubIDs defined by dimensions.ai. - - :param pubIDs: list of PubIDs (string type alias) - :type pubIDs: List[PubID] - :param authors: whether to fetch author information, defaults to True - :type authors: bool, optional - :return: status-bool (True if everything is okay), dataframe containing - required information for input publications - :rtype: Tuple[bool, pd.DataFrame] - """ - if self._verbose: - print(f"hi, this is fetchPubsByIDs() for pubs = {pubIDs}") - - if authors: - query = f""" - search publications - where id in {json.dumps(pubIDs)} - return publications[id+doi+title+category_for+year - +reference_ids+authors+journal_title_raw+times_cited] - limit {len(pubIDs)} - """ - else: - query = f""" - search publications - where id in {json.dumps(pubIDs)} - return publications[id+doi+title+category_for+year - +reference_ids+journal_title_raw+times_cited] - limit {len(pubIDs)} - """ - - dsl_data = self.dsl.query(query, verbose=self._verbose) - - df = dsl_data.as_dataframe() - - try: - df["target_refs"] = df["reference_ids"] - except (TypeError, KeyError): - return False, pd.DataFrame() - - # replace NaN with empty list - df["target_refs"] = df["target_refs"].apply( - lambda target_ref: [] if type(target_ref) == float else target_ref - ) - - if not authors: - df["authors"] = [np.nan] * len(df) - - return True, df - - def fetchPubsByDois( - self, dois: List[Doi], authors: bool = True - ) -> Tuple[bool, pd.DataFrame]: - """ - Fetch publications from dimcli using DOIs. 
- - :param dois: list of DOIs (string type alias) - :type dois: List[Doi] - :param authors: whether to fetch author information, defaults to True - :type authors: bool, optional - :return: status-bool (True if everything is okay), dataframe containing - required information for input publications - :rtype: Tuple[bool, pd.DataFrame] - """ - if self._verbose: - print(f"hi, this is fetchOrigin() for doi = {dois}") - - if authors: - query = f""" - search publications - where doi in {json.dumps(dois)} - return publications[id+doi+title+category_for+year - +reference_ids+authors+journal_title_raw+times_cited] - """ - else: - query = f""" - search publications - where doi in {json.dumps(dois)} - return publications[id+doi+title+category_for+year - +reference_ids+journal_title_raw+times_cited] - """ - - dsl_data = self.dsl.query(query, verbose=self._verbose) - - df = dsl_data.as_dataframe() - try: - df["target_refs"] = df["reference_ids"] - except (TypeError, KeyError): - return False, pd.DataFrame() - - # replace NaN with empty list - df["target_refs"] = df["target_refs"].apply( - lambda target_ref: [] if type(target_ref) == float else target_ref - ) - - if not authors: - df["authors"] = [np.nan] * len(df) - - return True, df - - def fetchCitations( - self, pubIDs: List[PubID], authors: bool = False - ) -> Tuple[bool, pd.DataFrame]: - """ - Fetch citing publications for a list of publications using their PubIDs. - - :param pubIDs: list of PubIDs (string type alias) - :type pubIDs: List[PubID] - :param authors: whether to fetch author information, defaults to False - :type authors: bool, optional - :return: status-bool (True if everything is okay), dataframe containing - required information for citing publications - :rtype: Tuple[bool, pd.DataFrame] - """ - if self._verbose: - print(f"hi, this is fetchCitations() for pubs = {pubIDs}") - - dfs = [] - if math.ceil(len(pubIDs) / 512) > 1: - __range__ = tqdm(range(math.ceil(len(pubIDs) / 512))) - else: - __range__ = range(math.ceil(len(pubIDs) / 512)) - - for i in __range__: - # dimcli queries are limited to 512 entites per list for `in` filtering - offset = i * 512 - - if authors: - query = f""" - search publications - where reference_ids in {json.dumps(pubIDs[offset:offset+512])} - return publications[id+doi+title+category_for+year - +reference_ids+authors+journal_title_raw+times_cited] - """ - else: - query = f""" - search publications - where reference_ids in {json.dumps(pubIDs[offset:offset+512])} - return publications[id+doi+title+category_for+year - +reference_ids+journal_title_raw+times_cited] - """ - - dsl_data = self.dsl.query_iterative(query, verbose=self._verbose) - tmp = dsl_data.as_dataframe() - - try: - _ = tmp["reference_ids"] - except (TypeError, KeyError): - return False, pd.DataFrame() - - dfs.append(tmp) - - df = pd.concat(dfs) - # intersection of input pubIDs and references of each publication - df["target_refs"] = df["reference_ids"].apply( - lambda row_refs: list(set(pubIDs) & set(row_refs)) - ) - - if not authors: - df["authors"] = [np.nan] * len(df) - - return True, df - - def fetchReferences( - self, pubIDs: List[PubID], authors: bool = True - ) -> Tuple[bool, pd.DataFrame]: - """ - Fetch references for a list of publications using their PubIDs as defined by dimensions.ai. 
- - :param pubIDs: list of PubIDs (string type alias) - :type pubIDs: List[PubID] - :param authors: whether to fetch author information, defaults to True - :type authors: bool, optional - :return: status-bool (True if everything is okay), dataframe containing - required information for references - :rtype: Tuple[bool, pd.DataFrame] - """ - if self._verbose: - print(f"hi, this is fetchReferences() for pubs = {pubIDs}") - - dfs = [] - - if math.ceil(len(pubIDs) / 512) > 1: - __range__ = tqdm(range(math.ceil(len(pubIDs) / 512))) - else: - __range__ = range(math.ceil(len(pubIDs) / 512)) - - # get references (PubID) of given PubIDs - for i in __range__: - # dimcli queries are limited to 512 entites per list for `in` filtering - offset = i * 512 - - query = f""" - search publications - where id in {json.dumps(pubIDs[offset:offset + 512])} - return publications[id+reference_ids] - limit 512 - """ - - dsl_data = self.dsl.query(query, verbose=self._verbose) - tmp = dsl_data.as_dataframe() - - try: - _ = tmp["reference_ids"] - except (TypeError, KeyError): - return False, pd.DataFrame() - - dfs.append(tmp) - - df0 = pd.concat(dfs) - - # flatten list of references - # List[List[PubID]] -> List[PubID] - refs = [x for x in df0["reference_ids"].dropna().to_list() for x in x] - - # drop duplicates - refs = list(set(refs)) - - dfs = [] - if math.ceil(len(refs) / 512) > 1: - __range__ = tqdm(range(math.ceil(len(refs) / 512))) - else: - __range__ = range(math.ceil(len(refs) / 512)) - - for i in __range__: - # dimcli queries are limited to 512 entites per list for `in` filtering - offset = i * 512 - ok, df = self.fetchPubsByIDs(refs[offset : offset + 512], authors=authors) - if ok: - dfs.append(df) - else: # pragma: no cover - # cannot be reached unless dimensions database is malicious - return False, pd.DataFrame() - - return True, pd.concat(dfs) - - def run( - self, doi: Doi, levels_ref: int = 2, levels_cite: int = 2, authors: bool = False - ) -> Tuple[bool, pd.DataFrame]: - """ - Generate citation network for a publication using its DOI. 
-
-        :param doi: input DOI (string type alias)
-        :type doi: Doi
-        :param levels_ref: number of levels for references, defaults to 2
-        :type levels_ref: int, optional
-        :param levels_cite: number of levels for citing publications, defaults to 2
-        :type levels_cite: int, optional
-        :param authors: whether to include author information, defaults to False
-        :type authors: bool, optional
-        :return: status-bool (True if everything is okay), dataframe containing
-            required information for input publications, references and citing publications
-        :rtype: Tuple[bool, pd.DataFrame]
-        """
-        if hasattr(self, "result_df"):
-            return True, self.result_df
-
-        self.startDoi = doi
-
-        dfs = []
-
-        print("level 0")
-        ok, df_origin = self.fetchPubsByDois([doi], authors)
-        if not ok:
-            print(f"could not fetch publication for DOI {doi}")
-            return False, pd.DataFrame()
-
-        df_origin["level"] = 0
-        dfs.append(df_origin)
-
-        ok, dfs_cite = self._fetchCite(df_origin, levels_cite, authors)
-        ok, dfs_ref = self._fetchRef(df_origin, levels_ref, authors)
-
-        dfs.extend(dfs_cite + dfs_ref)
-
-        self.result_df: pd.DataFrame = pd.concat(dfs).reset_index(drop=True)
-
-        # cleaning
-        # self.result_df = self.dropDuplicates(self.result_df)
-        self.result_df["first_author"] = self.result_df["authors"].apply(
-            lambda authors: authors[0]["last_name"] if type(authors) == list else ""
-        )
-        self.result_df["ref_count"] = self.result_df["reference_ids"].apply(
-            lambda refs: len(refs) if type(refs) == list else None
-        )
-        self.result_df.index = self.result_df["id"]
-
-        self.main_node = df_origin.iloc[0].copy()
-        if type(self.main_node["authors"]) == list:
-            self.main_node["first_author"] = self.main_node["authors"][0]["last_name"]
-        else:
-            self.main_node["first_author"] = ""
-
-        self.result_df["main_category_for"] = self.result_df["category_for"].apply(
-            lambda c: [
-                x["name"]
-                for x in filter(lambda dict_: re.match(r"^\d\d\s", dict_["name"]), c)
-            ][0]
-            if type(c) == list
-            else ""
-        )
-
-        # replace NaN in reference_ids with empty list
-        self.result_df["reference_ids"] = self.result_df["reference_ids"].apply(
-            lambda target_ref: [] if type(target_ref) == float else target_ref
-        )
-
-        # include expanded target refs
-        # (intersection of references of a paper and all listed publications,
-        # e.g.
main_node cites A, Y cites main_node and A - # -> target_refs does not contain connection from Y to A) - all_pubs = set(self.result_df.index) - self.result_df["expanded_target_refs"] = self.result_df["reference_ids"].apply( - lambda reference_ids: list(all_pubs.intersection(reference_ids)) - ) - - return True, self.result_df - - def _fetchCite( - self, df_origin: pd.DataFrame, levels: int, authors: bool - ) -> Tuple[bool, List[pd.DataFrame]]: - dfs_cite = [] - - pubIDs = df_origin["id"].to_list() - for i in range(levels): - print(f"level {i + 1}, fetching citations for {len(pubIDs)} publications") - ok, tmp = self.fetchCitations(pubIDs, authors) - if ok: - pubIDs = tmp["id"].to_list() - tmp["level"] = i + 1 - dfs_cite.append(tmp) - else: # pragma: no cover - # cannot be reached unless dimensions database is malicious - return False, pd.DataFrame() - return True, dfs_cite - - def _fetchRef( - self, df_origin: pd.DataFrame, levels: int, authors: bool - ) -> Tuple[bool, List[pd.DataFrame]]: - dfs_ref = [] - pubIDs = df_origin["id"].to_list() - for i in range(levels): - print( - f"level {(i + 1) * (-1)}, fetching references for {len(pubIDs)} publications" - ) - ok, tmp = self.fetchReferences(pubIDs, authors) - if ok: - pubIDs = tmp["id"].to_list() - tmp["level"] = (i + 1) * (-1) - dfs_ref.append(tmp) - else: # pragma: no cover - # cannot be reached unless dimensions database is malicious - return False, pd.DataFrame() - return True, dfs_ref - - def _makeCompatibleRefDf( - self, df: pd.DataFrame, use_expanded: bool = False - ) -> pd.DataFrame: - """ - Reformat references dataframe to match prior versions formatting. - - :param df: dataframe as generated by .fetchReferences() - :type df: pd.DataFrame - :return: compatible dataframe - :rtype: pd.DataFrame - """ - levels_ref = min(df["level"]) - - # flatten references - if use_expanded: - target_ref_type = "expanded_target_refs" - else: - target_ref_type = "target_refs" - - ref_tuples = { - (row["id"], ref) - for _, row in df.query(f"{levels_ref} < level <= 0").iterrows() - for ref in row[target_ref_type] - } - - df = df[~df.index.duplicated(keep="first")] - - refs = [] - for (source_id, target_id) in ref_tuples: - source = df.loc[source_id] - target = df.loc[target_id] - - refs.append( - { - "type": "reference", - "sourceYear": source["year"], - "sourceDOI": source["doi"], - "sourcePubID": source["id"], - "sourceJournal": source["journal_title_raw"], - "targetFull": "", - "targetYear": target["year"], - "targetDOI": target["doi"], - "targetPubID": target["id"], - "targetrefCount": target["ref_count"], - "targetis_ref_byCount": target["times_cited"], - "targettitleStr": target["title"], - "targetFirstAuthor": target["first_author"], - "targetJournal": target["journal_title_raw"], - "targetSubject": target["category_for"], - } - ) - - return pd.DataFrame(refs) - - def _makeCompatibleCiteDf( - self, df: pd.DataFrame, use_expanded: bool = False - ) -> pd.DataFrame: - """ - Reformat citation dataframe to match prior versions formatting. 
- - :param df: dataframe as generated by .fetchCitations() - :type df: pd.DataFrame - :return: compatible dataframe - :rtype: pd.DataFrame - """ - levels_cite = max(df["level"]) - - # flatten citations - if use_expanded: - target_ref_type = "expanded_target_refs" - else: - target_ref_type = "target_refs" - - cite_tuples = { - (row["id"], ref) - for _, row in df.query(f"{levels_cite} >= level > 0").iterrows() - for ref in row[target_ref_type] - } - - df = df[~df.index.duplicated(keep="first")] - - cites = [] - for (source_id, target_id) in cite_tuples: - source = df.loc[source_id] - target = df.loc[target_id] - - cites.append( - { - "type": "citation", - "targetPubID": target["id"], - "targetYear": target["year"], - "targetDOI": target["doi"], - "targetJournal": target["journal_title_raw"], - "sourceYear": source["year"], - "sourceDOI": source["doi"], - "sourcePubID": source["id"], - "sourcerefCount": source["ref_count"], - "sourceis_ref_byCount": source["times_cited"], - "sourcetitleStr": source["title"], - "sourceFirstAuthor": source["first_author"], - "sourceJournal": source["journal_title_raw"], - "sourceSubject": source["category_for"], - } - ) - - return pd.DataFrame(cites) - - def makeCompatibleDf(self) -> Tuple[bool, pd.DataFrame]: - """ - Reformat dataframe to match prior versions formatting. - - :param df: dataframe as generated by .run() - :type df: pd.DataFrame - :return: compatible dataframe - :rtype: pd.DataFrame - """ - if not hasattr(self, "result_df"): - print("you gotta run .run() first") - return False, pd.DataFrame() - - if hasattr(self, "compatible_result_df"): - return True, self.compatible_result_df - - df_ref = self._makeCompatibleRefDf(self.result_df, self._make_hairball) - df_cite = self._makeCompatibleCiteDf(self.result_df, self._make_hairball) - - self.compatible_result_df: pd.DataFrame = pd.concat( - [df_cite, df_ref], ignore_index=True - ).fillna("") - - return True, self.compatible_result_df - - def runCompatible( - self, - doi: Doi, - level: int = 2, - direct: str = "both", - debug: bool = False, - ) -> Tuple[bool, pd.DataFrame]: - """ - Wrap .run() with same parameters and outputs as prior versions. 
-
-        :param doi: input DOI (string type alias)
-        :type doi: Doi
-        :param level: number of levels to fetch, defaults to 2
-        :type level: int, optional
-        :param direct: direction of search (either "ref", "cite" or "both"), defaults to "both"
-        :type direct: str, optional
-        :param debug: currently unused, defaults to False
-        :type debug: bool, optional
-        :return: status-bool (True if everything is okay), compatible dataframe
-        :rtype: Tuple[bool, pd.DataFrame]
-        """
-        if direct == "ref":
-            ok, df = self.run(doi, levels_ref=level, levels_cite=0)
-        elif direct == "cite":
-            ok, df = self.run(doi, levels_ref=0, levels_cite=level)
-        elif direct == "both":
-            ok, df = self.run(doi, levels_ref=level, levels_cite=level)
-        else:
-            print("provide proper direction of search (either `ref`, `cite` or `both`)")
-            return False, pd.DataFrame()
-
-        if not ok:
-            return False, pd.DataFrame()
-
-        ok, comp_df = self.makeCompatibleDf()
-        if ok:
-            return True, comp_df
-        else:
-            return False, pd.DataFrame()
-
-    def _nodeDict(self, row: pd.Series) -> Dict:
-        # row = row.fillna("")
-
-        if row["doi"].lower() == self.startDoi.lower():
-            inputDOI = "True"
-        else:
-            inputDOI = "False"
-        res = {
-            # "label": nodeName,
-            # "x": 0,
-            # "y": 0,
-            "id": row["id"],
-            "attributes": {
-                # "name": nodeName,
-                "title": row["title"],
-                "doi": row["doi"],
-                "nodeyear": row["year"],
-                "ref-by-count": row["times_cited"],
-                "is_input_DOI": inputDOI,
-                "category_for": row["main_category_for"],
-                "level": row["level"],
-            },
-            # "color": "rgb(0,0,0)",
-            # "size": 10
-        }
-        return res
-
-    def _edgeDict(self, row: pd.Series) -> Dict:
-        # row = row.fillna("")
-
-        res = {
-            "source": row["sourcePubID"],
-            "target": row["targetPubID"],
-            # "id": idx,
-            "attributes": {"year": row["sourceYear"], "type": row["type"]},
-            # "color": "rgb(0,0,0)",
-            # "size": 1
-        }
-        return res
-
-    def _createFilename(self, ext: str = "json") -> FilePath:
-        filename = self.startDoi
-        date = datetime.datetime.now().strftime("%Y-%m-%d")
-        for key, val in self.stringClean.items():
-            filename = re.sub(key, val, filename)
-        if self._make_hairball:
-            path = f"{self.main_node['first_author']}_{filename}_date_{date}_hairball.{ext}"
-        else:
-            path = f"{self.main_node['first_author']}_{filename}_date_{date}.{ext}"
-        return path
-
-    def createJSON(self, outputPath: FilePath = "./out") -> Tuple[bool, FilePath]:
-        """
-        Create JSON file on disk containing network as lists of nodes and edges for visualization.
- - :param outputPath: output directory, defaults to "./out" - :type outputPath: FilePath, optional - :return: status-bool (True if everything is okay), path of JSON file - :rtype: Tuple[bool, FilePath] - """ - if not hasattr(self, "result_df"): - print("You need to use .run() first to create some data to write.") - return False, "" - - if not hasattr(self, "compatible_result_df"): - self.makeCompatibleDf() - - allNodes = [ - x - for _, x in self.result_df[~self.result_df.index.duplicated()] - .fillna("") - .iterrows() - ] - allRows = [x for x in self.compatible_result_df.fillna("").iterrows()] - - outputPath = os.path.abspath(outputPath) - if not os.path.exists(outputPath): - os.mkdir(outputPath) - - with open(f"{outputPath}/{self._createFilename()}", "w") as outFile: - # write nodes - outFile.write('{\n "nodes": [\n') - - # write nodes from compatible_result_df/allNodes - while allNodes: - node = allNodes.pop() - if len(allNodes) == 0: - outFile.write(json.dumps(self._nodeDict(node)) + "\n") - else: - outFile.write(json.dumps(self._nodeDict(node)) + ",\n") - - # write edges - outFile.write(' ],\n "edges":[') - while allRows: - idx, edge = allRows.pop() - if len(allRows) == 0: - outFile.write(json.dumps(self._edgeDict(edge)) + "\n") - else: - x = self._edgeDict(edge) - outFile.write(json.dumps(x) + ",\n") - outFile.write(" ]\n}") - - return True, f"{outputPath}/{self._createFilename()}" - - def logout(self) -> None: - """ - Dimcli logout. - """ - dimcli.logout() From 77c76a4a7e5f5e24c93f0002a370c4763f17b0cd Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 22 Mar 2022 13:36:59 +0100 Subject: [PATCH 51/53] return more informative feedback vals --- src/semanticlayertools/visual/citationnet.py | 103 ++++++++++--------- 1 file changed, 52 insertions(+), 51 deletions(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index edf0aaf..dda4cd4 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -153,67 +153,68 @@ def query(self, startDoi='', citationLimit=100): f"""search publications where - doi = "{startDoi}" and times_cited <= {self.citationLimit} + doi = "{startDoi}" return publications[id+authors+doi+times_cited+category_for+title+year+reference_ids] """, verbose=self._verbose ) querydf = doi2id.as_dataframe() + if querydf.shape[0] == 0: + return f"The dataset contains no entry for {startDoi}." + elif querydf['times_cited'].iloc[0] >= self.citationLimit: + return f"{startDoi} is cited {querydf['times_cited'].iloc[0]} times. You can try to change the limit, if possible." 
try: self.firstAuthor = doi2id.as_dataframe_authors()['last_name'].iloc[0] except KeyError: pass - if querydf.shape[0] > 0: - self.pubids = querydf['id'].values[0] - self.pubrefs = list( - [x for y in querydf['reference_ids'].values for x in y] - ) - self.dataframeList.append( - self._editDF(querydf, dftype="ref_l1") - ) - ref1trgtList = list(self.dataframeList[0].target.values) - cit1df = self.dsl.query_iterative( - f"""search - publications - where - reference_ids = "{self.pubids}" - return - publications[id+doi+times_cited+category_for+title+year+reference_ids] - """, - verbose=self._verbose) - self.dataframeList.append( - self._editDF(cit1df.as_dataframe(), dftype='cite_l1') - ) - cit1SrcList = list(self.dataframeList[1].source.values) - cit2df = self.dsl.query_iterative( - f"""search - publications - where - reference_ids in {json.dumps(cit1SrcList)} - return - publications[id+doi+times_cited+category_for+title+year+reference_ids]""", - verbose=self._verbose - ) - self.dataframeList.append( - self._editDF(cit2df.as_dataframe(), dftype='cite_l2', level2List=cit1SrcList) - ) - ref2df = self.dsl.query_iterative( - f"""search - publications - where - id in {json.dumps(ref1trgtList)} - return - publications[id+doi+times_cited+category_for+title+year+reference_ids]""", - verbose=self._verbose - ) - self.dataframeList.append( - self._editDF(ref2df.as_dataframe(), dftype='ref_l2') - ) - print(f'Finished queries in {time.time() - starttime} seconds.') - return self - else: - return f'The requested DOI {startDoi} is either cited to often or not available in the dataset.' + self.pubids = querydf['id'].values[0] + self.pubrefs = list( + [x for y in querydf['reference_ids'].values for x in y] + ) + self.dataframeList.append( + self._editDF(querydf, dftype="ref_l1") + ) + ref1trgtList = list(self.dataframeList[0].target.values) + cit1df = self.dsl.query_iterative( + f"""search + publications + where + reference_ids = "{self.pubids}" + return + publications[id+doi+times_cited+category_for+title+year+reference_ids] + """, + verbose=self._verbose) + self.dataframeList.append( + self._editDF(cit1df.as_dataframe(), dftype='cite_l1') + ) + cit1SrcList = list(self.dataframeList[1].source.values) + cit2df = self.dsl.query_iterative( + f"""search + publications + where + reference_ids in {json.dumps(cit1SrcList)} + return + publications[id+doi+times_cited+category_for+title+year+reference_ids]""", + verbose=self._verbose + ) + self.dataframeList.append( + self._editDF(cit2df.as_dataframe(), dftype='cite_l2', level2List=cit1SrcList) + ) + ref2df = self.dsl.query_iterative( + f"""search + publications + where + id in {json.dumps(ref1trgtList)} + return + publications[id+doi+times_cited+category_for+title+year+reference_ids]""", + verbose=self._verbose + ) + self.dataframeList.append( + self._editDF(ref2df.as_dataframe(), dftype='ref_l2') + ) + print(f'Finished queries in {time.time() - starttime} seconds.') + return self def returnLinks(self): return pd.concat(self.dataframeList) From f940b72e60feeedb183b682f9867c5af46879e25 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 22 Mar 2022 14:48:21 +0100 Subject: [PATCH 52/53] chg output format of duration --- src/semanticlayertools/visual/citationnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index dda4cd4..61134be 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -268,4 +268,4 @@ 
def generateNetworkFiles(self, outfolder):
         outfile = os.path.join(outfolder, f'{firstauthor}_{doiname}.json')
         with open(outfile, 'w', encoding="utf8") as ofile:
             json.dump(outformat, ofile, ensure_ascii=True)
-        return {time.time() - starttime}, f'{firstauthor}_{doiname}.json'
+        return time.time() - starttime, f'{firstauthor}_{doiname}.json'

From 8c63199baab4f94045024bc3c9e83f15f0a0eed3 Mon Sep 17 00:00:00 2001
From: Malte Vogl
Date: Thu, 24 Mar 2022 10:03:46 +0100
Subject: [PATCH 53/53] wip fix doc for visual

---
 docs/visual.rst                              | 3 ++-
 src/semanticlayertools/visual/citationnet.py | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/visual.rst b/docs/visual.rst
index 09e7e89..8fba857 100644
--- a/docs/visual.rst
+++ b/docs/visual.rst
@@ -68,6 +68,7 @@ references as well as its citations and their citations. With this
 means, visualizations of it show academic roots and conduits and can
 display disciplinary pathways.

-.. automodule:: semanticlayertools.visual.generateCitationTree
+.. automodule:: semanticlayertools.visual.citationnet
   :members:
+   :private-members:
   :undoc-members:
diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py
index 61134be..d0c7926 100644
--- a/src/semanticlayertools/visual/citationnet.py
+++ b/src/semanticlayertools/visual/citationnet.py
@@ -47,7 +47,7 @@ def __init__(self, verbose: bool = False, api_key=""):
     def _cleanTitleString(self, row):
         """Clean non-JSON characters from titles.

-        Removes newline characters and double backslashes.
+        Removes newline characters, double backslashes, and quoted '"' characters.
         """
         try:
             title = row
@@ -217,9 +217,11 @@ def query(self, startDoi='', citationLimit=100):
         return self

     def returnLinks(self):
+        """Return all links as a single dataframe."""
         return pd.concat(self.dataframeList)

     def generateNetworkFiles(self, outfolder):
+        """Generate a JSON file with node and edge lists."""
         starttime = time.time()
         outformat = {'nodes': [], 'edges': []}
         dflinks = pd.concat(self.dataframeList)
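
The last three patches settle the public workflow of the citationnet module: query() now reports why a request was rejected (unknown DOI, or more citations than the chosen limit) and otherwise returns the instance, and generateNetworkFiles() returns the elapsed time together with the written filename. A minimal usage sketch of that workflow, assuming the class in semanticlayertools.visual.citationnet is importable as CitationNet (the class name is not visible in these hunks) and that dimensions.ai credentials are available via api_key or a local dsl.ini; the DOI, key, and output folder are placeholders:

    # Sketch only: class name, API key, DOI, and output folder are assumed placeholders.
    from semanticlayertools.visual.citationnet import CitationNet

    net = CitationNet(api_key="MY_DIMENSIONS_KEY")

    # query() returns an explanatory string if the DOI is unknown or cited more
    # often than citationLimit allows; on success it returns the instance itself.
    result = net.query(startDoi="10.1000/example.doi", citationLimit=100)
    if isinstance(result, str):
        print(result)
    else:
        links = net.returnLinks()  # all link dataframes concatenated into one
        # The output folder must already exist; returns (duration in seconds, filename).
        duration, filename = net.generateNetworkFiles("./out")
        print(f"Wrote {filename} in {duration:.1f} s with {len(links)} links")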