From 219278da538b7973f469e87c0a15165a73a8f522 Mon Sep 17 00:00:00 2001 From: "malte.vogl" Date: Fri, 24 Sep 2021 07:42:45 +0000 Subject: [PATCH 01/53] Initial commit --- README.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..df7a75b --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# SemanticLayerTools + +Collects tools to create semantic layers in the socio-epistemic networks framework. Source material can be any structured corpus with metadata of authors, time, and at least one text column. \ No newline at end of file From 53173d65d4682142b5a034cba778bebf9f251811 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Fri, 24 Sep 2021 14:55:46 +0200 Subject: [PATCH 02/53] init semantic layer tools package --- .gitignore | 4 + LICENSE | 21 ++++ docs/Makefile | 20 ++++ docs/conf.py | 55 +++++++++++ docs/index.rst | 22 +++++ docs/make.bat | 35 +++++++ pyproject.toml | 6 ++ setup.cfg | 29 ++++++ src/semanticlayertools/__init__.py | 0 src/semanticlayertools/cleaning/__init__.py | 0 src/semanticlayertools/linkage/__init__.py | 0 src/semanticlayertools/linkage/wordscore.py | 104 ++++++++++++++++++++ 12 files changed, 296 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 docs/Makefile create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/make.bat create mode 100644 pyproject.toml create mode 100644 setup.cfg create mode 100644 src/semanticlayertools/__init__.py create mode 100644 src/semanticlayertools/cleaning/__init__.py create mode 100644 src/semanticlayertools/linkage/__init__.py create mode 100644 src/semanticlayertools/linkage/wordscore.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ba732dc --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +**build +**dist +*env +**.egg-info diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..b2f6552 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Malte Vogl (ModelSEN project) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . 
+BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..372d312 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,55 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'SemanticLayerTools' +copyright = '2021, Malte Vogl' +author = 'Malte Vogl' + +# The full version, including alpha/beta/rc tags +release = '0.0.1' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..e57bcff --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,22 @@ +.. SemanticLayerTools documentation master file, created by + sphinx-quickstart on Fri Sep 24 14:43:14 2021. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to SemanticLayerTools's documentation! +============================================== + +This project collects tools to build semantic layers from text corpora. + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..8084272 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..374b58c --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..2113f09 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,29 @@ +[metadata] +name = semanticlayertools +version = 0.0.1 +author = Malte Vogl +author_email = mvogl@mpiwg-berlin.mpg.de +description = Create semantic layers using different methods for word linking. +long_description = file: README.md +long_description_content_type = text/markdown +url = https://gitlab.gwdg.de/modelsen/semanticlayertools +project_urls = + Project Home = https://modelsen.mpiwg-berlin.mpg.de + Bug Tracker = https://gitlab.gwdg.de/modelsen/semanticlayertools/-/issues +classifiers = + Programming Language :: Python :: 3 + License :: OSI Approved :: MIT License + Operating System :: OS Independent + +[options] +package_dir = + = src +packages = find: +python_requires = >=3.6 +install_requires = + tqdm + nltk + numpy + +[options.packages.find] +where = src diff --git a/src/semanticlayertools/__init__.py b/src/semanticlayertools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semanticlayertools/cleaning/__init__.py b/src/semanticlayertools/cleaning/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semanticlayertools/linkage/__init__.py b/src/semanticlayertools/linkage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semanticlayertools/linkage/wordscore.py b/src/semanticlayertools/linkage/wordscore.py new file mode 100644 index 0000000..7ce5d67 --- /dev/null +++ b/src/semanticlayertools/linkage/wordscore.py @@ -0,0 +1,104 @@ +import os +import re +from collections import Counter, defaultdict + +from tqdm import tqdm +import numpy as np +import nltk + +try: + nltk.pos_tag(nltk.word_tokenize('This is a test sentence.')) +except LookupError: + print('Installing nltk perceptron tagger.') + nltk.download('averaged_perceptron_tagger') + + +class CalculateScores(object): + """Calculates ngram scores for documents. 
+ + Considered parts of speech are (see NLTK docs for details) + - Nouns: 'NN', 'NNS', 'NNP', 'NNPS' + - Adjectives: 'JJ', 'JJR', 'JJS' + """ + + def __init__(self, sourceDataframe, textCol="text", pubIDCol="pubID", ngramsize=5,): + + self.baseDF = sourceDataframe + self.textCol = textCol + self.pubIDCol = pubIDCol + self.ngramEnd = ngramsize + self.outputDict = {} + self.allNGrams = [] + self.counts = {} + self.allgramslist = [] + self.uniqueNGrams = () + + def getTermPatterns(self): + """Create dictionaries of occuring ngrams.""" + allNGrams = {x: [] for x in range(1, self.ngramEnd + 1, 1)} + pos_tag = ["NN", "NNS", "NNP", "NNPS", "JJ", "JJR", "JJS"] + for _, row in tqdm(self.baseDF.iterrows()): + tokens = nltk.word_tokenize(row[self.textCol]) + pos = nltk.pos_tag(tokens) + nnJJtokens = [x[0].lower() for x in pos if x[1] in pos_tag] + tempNGram = [] + for i in range(1, self.ngramEnd + 1, 1): + val = allNGrams[i] + newngrams = list(nltk.ngrams(nnJJtokens, i)) + val.extend(newngrams) + tempNGram.extend(newngrams) + allNGrams.update({i: val}) + self.outputDict[row[self.pubIDCol]] = tempNGram + self.allNGrams = allNGrams + allgrams = [x for y in [y for x, y in self.allNGrams.items()] for x in y] + self.allgramslist = allgrams + self.counts = Counter(allgrams) + self.uniqueNGrams = set(allgrams) + + def getScore(self, target): + """Calculate ngram score.""" + meta = { + "target": target, + "counts": self.counts[target], + "corpusL": len(self.allgramslist), + "maxL": len(target), + } + + res = defaultdict(list()) + + for idx, subgram in enumerate(target): + key = idx + 1 + for tup in self.allNGrams[2]: + if tup[1:][0] == subgram: + res[f"l_{key}"].append(tup[:1][0]) + elif tup[:-1][0] == subgram: + res[f"r_{key}"].append(tup[1:][0]) + valueList = [] + for L in range(1, meta["maxL"] + 1, 1): + leftkey = f"l_{L}" + rightkey = f"r_{L}" + if rightkey not in res.keys(): + rvalue = 0 + else: + rvalue = len(list(set(res[rightkey]))) + if leftkey not in res.keys(): + lvalue = 0 + else: + lvalue = len(list(set(res[leftkey]))) + valueList.append((lvalue + 1) * (rvalue + 1)) + return { + target: meta["counts"] * (np.prod(valueList)) ** (1 / (2.0 * meta["maxL"])) + } + + def run(self): + """Get score for all documents.""" + scores = {} + self.getTermPatterns() + for target in tqdm(self.uniqueNGrams): + scores.update(self.getScore(target)) + for key, val in self.outputDict.items(): + tmpList = [] + for elem in val: + tmpList.append([elem, scores[elem]]) + self.outputDict.update({key: tmpList}) + return scores, self.outputDict From 258d8399c94f199ee8a0e58c21df5fce0eb217a5 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 27 Sep 2021 16:02:58 +0200 Subject: [PATCH 03/53] wip continue word scoring linkage --- src/semanticlayertools/cleaning/clean.py | 0 src/semanticlayertools/linkage/wordscore.py | 184 +++++++++++++++++++- 2 files changed, 178 insertions(+), 6 deletions(-) create mode 100644 src/semanticlayertools/cleaning/clean.py diff --git a/src/semanticlayertools/cleaning/clean.py b/src/semanticlayertools/cleaning/clean.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semanticlayertools/linkage/wordscore.py b/src/semanticlayertools/linkage/wordscore.py index 7ce5d67..b7239e4 100644 --- a/src/semanticlayertools/linkage/wordscore.py +++ b/src/semanticlayertools/linkage/wordscore.py @@ -1,9 +1,12 @@ import os import re from collections import Counter, defaultdict +from itertools import islice, combinations + from tqdm import tqdm import numpy as np +import pandas as pd import nltk try: @@ 
-13,7 +16,7 @@ nltk.download('averaged_perceptron_tagger') -class CalculateScores(object): +class CalculateScores(): """Calculates ngram scores for documents. Considered parts of speech are (see NLTK docs for details) @@ -21,11 +24,12 @@ class CalculateScores(object): - Adjectives: 'JJ', 'JJR', 'JJS' """ - def __init__(self, sourceDataframe, textCol="text", pubIDCol="pubID", ngramsize=5,): + def __init__(self, sourceDataframe, textColumn="text", pubIDColumn="pubID", yearColumn='year', ngramsize=5,): self.baseDF = sourceDataframe - self.textCol = textCol - self.pubIDCol = pubIDCol + self.textCol = textColumn + self.pubIDCol = pubIDColumn + self.yearCol = yearColumn self.ngramEnd = ngramsize self.outputDict = {} self.allNGrams = [] @@ -87,10 +91,10 @@ def getScore(self, target): lvalue = len(list(set(res[leftkey]))) valueList.append((lvalue + 1) * (rvalue + 1)) return { - target: meta["counts"] * (np.prod(valueList)) ** (1 / (2.0 * meta["maxL"])) + target: meta["counts"]/meta["corpusL"] * (np.prod(valueList)) ** (1 / (2.0 * meta["maxL"])) } - def run(self): + def run(self, write=False, outpath='./'): """Get score for all documents.""" scores = {} self.getTermPatterns() @@ -101,4 +105,172 @@ def run(self): for elem in val: tmpList.append([elem, scores[elem]]) self.outputDict.update({key: tmpList}) + if write is True: + for year, df in self.baseDF.groupby(self.yearCol): + with open(f'{outpath}{str(year)}.csv', 'a') as yearfile: + for pub in df[self.pubIDCol].unique(): + for elem in self.outputDict[pub]: + yearfile.write(f'{pub},{elem[0]},{elem[1]}') return scores, self.outputDict + + +class LinksOverTime(): + """To keep track of nodes over time, we need a global register of node names. + + Input: + """ + + def __init__(self, outputPath, scorePath, dataframe, scoreLimit=1.0, debug=False, windowSize=1): + self.dataframe = dataframe + self.authorCol = 'author' + self.pubIDCol = 'pubIDelm' + self.scoreLimit = scoreLimit + self.outpath = outputPath + self.scorepath = scorePath + self.nodeMap = {} + self.debug = debug + self.windowSize = windowSize + + def _window(self, seq): + """Return a sliding window (of width n) over data from the iterable. + + s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... 
+ """ + n = self.windowSize + it = iter(seq) + result = tuple(islice(it, n)) + if len(result) == n: + yield result + for elem in it: + result = result[1:] + (elem,) + yield result + + def _createSlices(self): + slices = [] + years = sorted(self.dataframe.year.unique()) + for x in self._window(years): + slices.append(x) + return slices + + def createNodeRegister(self, sl): + """Create multilayer node register for time slice.""" + if self.debug is True: + print(f'Slice: {sl[0]}') + dataframe = self.dataframe[self.dataframe.year.isin(sl)] + dfNgramsList = [pd.read_csv( + self.scorepath + str(slN) + '.tsv', + sep='\t', + header=None + ) for slN in sl] + ngramdataframe = pd.concat(dfNgramsList) + ngramdataframe = ngramdataframe[ngramdataframe[2] > self.scoreLimit] + + authorList = [x for y in dataframe[self.authorCol].values for x in y] + + authors = [x for x in set(authorList) if x] + pubs = dataframe[self.pubIDCol].fillna('None').unique() + ngrams = ngramdataframe[1].unique() + + for authorval in authors: + if not self.nodeMap.values(): + self.nodeMap.update({authorval: 1}) + else: + if authorval not in self.nodeMap.keys(): + self.nodeMap.update( + {authorval: max(self.nodeMap.values()) + 1} + ) + for pubval in list(pubs): + if pubval not in self.nodeMap.keys(): + self.nodeMap.update({pubval: max(self.nodeMap.values()) + 1}) + for ngramval in list(ngrams): + if ngramval not in self.nodeMap.keys(): + self.nodeMap.update({ngramval: max(self.nodeMap.values()) + 1}) + + if self.debug is True: + print( + '\tNumber of vertices (authors, papers and ngrams) {0}'.format( + max(self.nodeMap.values()) + ) + ) + + def writeLinks(self, sl, recreate=False): + """Write links to file.""" + dataframe = self.dataframe[self.dataframe.year.isin(sl)] + filePath = self.outpath + 'multilayerPajek_{0}.net'.format(sl[0]) + + if os.path.isfile(filePath): + if recreate is False: + raise IOError( + f'File at {filePath} exists. Set recreate = True to rewrite file.' 
+ ) + if recreate is True: + os.remove(filePath) + + dfNgramsList = [pd.read_csv( + self.scorepath + str(slN) + '.tsv', + sep='\t', + header=None + ) for slN in sl] + ngramdataframe = pd.concat(dfNgramsList) + ngramdataframe = ngramdataframe[ngramdataframe[2] > self.scoreLimit] + + with open(filePath, 'a') as file: + file.write("# A network in a general multiplex format\n") + file.write("*Vertices {0}\n".format(max(self.nodeMap.values()))) + for x, y in self.nodeMap.items(): + tmpStr = '{0} "{1}"\n'.format(y, x) + if tmpStr: + file.write(tmpStr) + file.write("*Multiplex\n") + file.write("# layer node layer node [weight]\n") + if self.debug is True: + print('\tWriting inter-layer links to file.') + for _, row in dataframe.fillna('').iterrows(): + authors = row[self.authorCol] + paper = row[self.pubIDCol] + if paper not in self.nodeMap.keys(): + print(f'Cannot find {paper}') + ngramsList = ngramdataframe[ngramdataframe[0] == paper] + paperNr = self.nodeMap[paper] + if len(authors) >= 2: + # pairs = [x for x in combinations(authors, 2)] + for pair in combinations(authors, 2): # pairs: + file.write('{0} {1} {2} {3} 1\n'.format( + 1, + self.nodeMap[pair[0]], + 1, + self.nodeMap[pair[1]] + ) + ) + for author in authors: + try: + authNr = self.nodeMap[author] + file.write('{0} {1} {2} {3} 1\n'.format( + 1, + authNr, + 2, + paperNr + ) + ) + except KeyError: + pass + for _, ngramrow in ngramsList.iterrows(): + try: + ngramNr = self.nodeMap[ngramrow[1]] + weight = ngramrow[2] + file.write('{0} {1} {2} {3} {4}\n'.format( + 2, + paperNr, + 3, + ngramNr, + weight + ) + ) + except KeyError: + pass + + def run(self, recreate=False): + """Create all data for slices.""" + for sl in tqdm(self._createSlices()): + self.createNodeRegister(sl) + self.writeLinks(sl, recreate=recreate) From 1f05a8e3c335f239ea06941f9626072c44228a77 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 28 Sep 2021 15:03:41 +0200 Subject: [PATCH 04/53] wip add tests and docs with tox --- .gitignore | 3 ++ docs/cleaning.rst | 6 ++++ docs/conf.py | 6 +++- docs/index.rst | 3 ++ docs/linkage.rst | 6 ++++ requirements.txt | 5 +++ requirements_dev.txt | 1 + setup.cfg | 4 ++- src/semanticlayertools/cleaning/text.py | 36 +++++++++++++++++++ src/semanticlayertools/linkage/wordscore.py | 18 +++++----- .../cleaning/clean.py => tests/__init__.py | 0 tests/cleaning/__init__.py | 0 tests/cleaning/test_textcleaning.py | 15 ++++++++ tests/linkage/__init__.py | 0 tests/linkage/test_wordscore.py | 19 ++++++++++ tests/testdata/testdata.json | 1 + tox.ini | 23 ++++++++++++ 17 files changed, 135 insertions(+), 11 deletions(-) create mode 100644 docs/cleaning.rst create mode 100644 docs/linkage.rst create mode 100644 requirements.txt create mode 100644 requirements_dev.txt create mode 100644 src/semanticlayertools/cleaning/text.py rename src/semanticlayertools/cleaning/clean.py => tests/__init__.py (100%) create mode 100644 tests/cleaning/__init__.py create mode 100644 tests/cleaning/test_textcleaning.py create mode 100644 tests/linkage/__init__.py create mode 100644 tests/linkage/test_wordscore.py create mode 100644 tests/testdata/testdata.json create mode 100644 tox.ini diff --git a/.gitignore b/.gitignore index ba732dc..48a75fd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ **build **dist *env +*testlab **.egg-info +*.tox +**__pycache__ diff --git a/docs/cleaning.rst b/docs/cleaning.rst new file mode 100644 index 0000000..c54eeec --- /dev/null +++ b/docs/cleaning.rst @@ -0,0 +1,6 @@ +Text and data cleaning +====================== + +.. 
automodule:: semanticlayertools.cleaning.text + :members: + :undoc-members: diff --git a/docs/conf.py b/docs/conf.py index 372d312..7ce9006 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,6 +24,8 @@ # The full version, including alpha/beta/rc tags release = '0.0.1' +master_doc = 'index' + # -- General configuration --------------------------------------------------- @@ -31,6 +33,8 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx' ] # Add any paths that contain templates here, relative to this directory. @@ -52,4 +56,4 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] \ No newline at end of file +html_static_path = ['_static'] diff --git a/docs/index.rst b/docs/index.rst index e57bcff..747681b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,6 +12,9 @@ This project collects tools to build semantic layers from text corpora. :maxdepth: 2 :caption: Contents: + linkage + cleaning + Indices and tables diff --git a/docs/linkage.rst b/docs/linkage.rst new file mode 100644 index 0000000..441a5d1 --- /dev/null +++ b/docs/linkage.rst @@ -0,0 +1,6 @@ +Word scoring and linkage +======================== + +.. automodule:: semanticlayertools.linkage.wordscore + :members: + :undoc-members: diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6d96cb6 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +tqdm +nltk +numpy +spacy +pandas diff --git a/requirements_dev.txt b/requirements_dev.txt new file mode 100644 index 0000000..053148f --- /dev/null +++ b/requirements_dev.txt @@ -0,0 +1 @@ +tox diff --git a/setup.cfg b/setup.cfg index 2113f09..4f861e3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,11 +19,13 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.6 +python_requires = >=3.7 install_requires = tqdm nltk numpy + spacy + pandas [options.packages.find] where = src diff --git a/src/semanticlayertools/cleaning/text.py b/src/semanticlayertools/cleaning/text.py new file mode 100644 index 0000000..8d7f2cb --- /dev/null +++ b/src/semanticlayertools/cleaning/text.py @@ -0,0 +1,36 @@ +import re +import spacy + +try: + nlp = spacy.load("en_core_web_lg") +except OSError: + pass + + + +def lemmaSpacy(text): + """Clean text in dataframe column.""" + try: + if isinstance(text, list): + text = text[0] + doc = nlp(text) + tokens = ' '.join( + [t.lemma_ for t in doc if not t.is_stop and len(t) > 3] + ) + return tokens.lower() + except: + return '' + + +def htmlTags(text): + """Remove html tags in text.""" + if isinstance(text, list): + text = text[0] + for tagPair in [ + ('', '_'), + ('', ''), + ('', '^'), + ('', '') + ]: + text = re.sub(tagPair[0], tagPair[1], text) + return text diff --git a/src/semanticlayertools/linkage/wordscore.py b/src/semanticlayertools/linkage/wordscore.py index b7239e4..e19a69e 100644 --- a/src/semanticlayertools/linkage/wordscore.py +++ b/src/semanticlayertools/linkage/wordscore.py @@ -3,7 +3,6 @@ from collections import Counter, defaultdict from itertools import islice, combinations - from tqdm import tqdm import numpy as np import pandas as pd @@ -68,7 +67,7 @@ def getScore(self, target): "maxL": len(target), } - res = defaultdict(list()) + res = defaultdict(list) for idx, subgram in enumerate(target): key = idx + 
1 @@ -91,7 +90,7 @@ def getScore(self, target): lvalue = len(list(set(res[leftkey]))) valueList.append((lvalue + 1) * (rvalue + 1)) return { - target: meta["counts"]/meta["corpusL"] * (np.prod(valueList)) ** (1 / (2.0 * meta["maxL"])) + target: 1/meta["counts"] * (np.prod(valueList)) ** (1 / (2.0 * meta["maxL"])) } def run(self, write=False, outpath='./'): @@ -120,10 +119,11 @@ class LinksOverTime(): Input: """ - def __init__(self, outputPath, scorePath, dataframe, scoreLimit=1.0, debug=False, windowSize=1): + def __init__(self, outputPath, scorePath, dataframe, authorColumn='authors', pubIDColumn="pubID", yearColumn='year', scoreLimit=1.0, debug=False, windowSize=1): self.dataframe = dataframe - self.authorCol = 'author' - self.pubIDCol = 'pubIDelm' + self.authorCol = authorColumn + self.pubIDCol = pubIDColumn + self.yearColumn = yearColumn self.scoreLimit = scoreLimit self.outpath = outputPath self.scorepath = scorePath @@ -147,7 +147,7 @@ def _window(self, seq): def _createSlices(self): slices = [] - years = sorted(self.dataframe.year.unique()) + years = sorted(self.dataframe[self.yearColumn].unique()) for x in self._window(years): slices.append(x) return slices @@ -156,7 +156,7 @@ def createNodeRegister(self, sl): """Create multilayer node register for time slice.""" if self.debug is True: print(f'Slice: {sl[0]}') - dataframe = self.dataframe[self.dataframe.year.isin(sl)] + dataframe = self.dataframe[self.dataframe[self.yearColumn].isin(sl)] dfNgramsList = [pd.read_csv( self.scorepath + str(slN) + '.tsv', sep='\t', @@ -195,7 +195,7 @@ def createNodeRegister(self, sl): def writeLinks(self, sl, recreate=False): """Write links to file.""" - dataframe = self.dataframe[self.dataframe.year.isin(sl)] + dataframe = self.dataframe[self.dataframe[self.yearColumn].isin(sl)] filePath = self.outpath + 'multilayerPajek_{0}.net'.format(sl[0]) if os.path.isfile(filePath): diff --git a/src/semanticlayertools/cleaning/clean.py b/tests/__init__.py similarity index 100% rename from src/semanticlayertools/cleaning/clean.py rename to tests/__init__.py diff --git a/tests/cleaning/__init__.py b/tests/cleaning/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/cleaning/test_textcleaning.py b/tests/cleaning/test_textcleaning.py new file mode 100644 index 0000000..02b228a --- /dev/null +++ b/tests/cleaning/test_textcleaning.py @@ -0,0 +1,15 @@ +from semanticlayertools.cleaning.text import htmlTags, lemmaSpacy + + +def test_htmlclean(): + """Test removal of html tags.""" + testtext = "This He3 is really cool, super2 cool!" + resultString = "This He_3 is really cool, super^2 cool!" + assert htmlTags(testtext) == resultString + + +def test_lemmaSpacy(): + """Test lemmatizing with Spacy.""" + testtext = "In this paper we analyze the difficulties of gravity in rotating black holes." 
+ resultString = "paper analyze difficulty gravity rotate black hole" + assert lemmaSpacy(testtext) == resultString diff --git a/tests/linkage/__init__.py b/tests/linkage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/linkage/test_wordscore.py b/tests/linkage/test_wordscore.py new file mode 100644 index 0000000..f40a60e --- /dev/null +++ b/tests/linkage/test_wordscore.py @@ -0,0 +1,19 @@ +import unittest +import os +import pandas as pd +from semanticlayertools.linkage.wordscore import CalculateScores + +basePath = os.path.dirname(os.path.abspath(__file__ + "/../")) +filePath = f'{basePath}/testdata/testdata.json' + +df = pd.read_json(filePath) + +class TestCalculateScores(unittest.TestCase): + + def setUp(self): + self.scoreinit = CalculateScores(df, textColumn='clean', pubIDColumn='pubIDs') + self.scorePattern = self.scoreinit.getTermPatterns() + self.scoreOut = self.scoreinit.run() + + def test_scoring(self): + self.assertLessEqual(self.scoreOut[0][('theory',)], 1) diff --git a/tests/testdata/testdata.json b/tests/testdata/testdata.json new file mode 100644 index 0000000..3f86d7d --- /dev/null +++ b/tests/testdata/testdata.json @@ -0,0 +1 @@ +{"clean":{"0":"The adsorption of halogens on metal films\u2014I Adsorption measurements and surface potentials for chlorine on nickel","1":"Some Applications of Power Law Analysis to Radioisotope Studies in Man","2":"This investigation consists of two different but related parts. 1. The thermionic properties of rhenium and rhenium with an adsorbed layer of thorium atoms, were investigated. The Richardson constants for rhenium were found to be \u03c6=4.85 eV, A=66 A per cm^2-deg^2. The work function decreases with increasing thorium coverage \u03c3 to a minimum of 3.15 eV at \u03c3=4.2\u00d710^14 atoms per sq cm then rises to a constant value of 3.3 eV at \u03c3=8\u00d710^14 atoms per sq cm. A comparison with the system tungsten\u2014thorium shows that: (a) \u03c6_min occurs at the same surface density in both cases, (b) \u03c6_const is attained at the same surface density in both cases, and this density is that of the atoms in a (100) plane of thorium metal, and (c) the values of \u03c6_const are equal and different from that of bulk thorium by only 0.1 eV. On the basis of these regularities a model is proposed to explain the variation of \u03c6 with \u03c3. 2. The thermal desorption of the adsorbed atoms was studied at temperatures between 2203\u00b0 and 2468\u00b0K. At low coverage the desorption rate per atom \u03b7_0, was found to be constant with coverage, and to obey the relation \u03b7_0=CT exp (-\u220a\/kT) with C=4\u00d710^10 sec^-1 deg^-1 and \u220a=8.30 eV. The desorption rate increased for coverages between 0.05 and 0.5 monolayer and this increase agreed well with the predictions of a theory based on interactions among the adsorbed atoms.","3":"The Theory of Space, Time and Gravitation","4":"500-Kv-Line Design","5":"In this paper we have considered certain problems which arise when one attempts to cast a covariant field theory into a canonical form. Because of the invariance properties of the theory, certain identities exist between the canonical field variables. To insure that the canonical theory is equivalent to the underlying lagrangian formalism one must require that these identities, once satisfied, will remain satisfied through the course of time. In general, this will be true only if additional constraints are set between the canonical variables. 
We have shown that only a finite number of such constraints exist and that they form a function group. Our proof rests essentially on the possibility of constructing a generating function for an infinitesimal canonical transformation that is equivalent to an invariant infinitesimal transformation on the lagrangian formalism. Once a hamiltonian is obtained by one of the procedures outlined in previous papers of this series, and the constraints have all been found, the consistent, invariant canonical formulation of the theory is completed. The main results of the paper have been formulated in such a manner as to make them applicable to a fairly general type of invariance. In the last sections we have applied these results to the cases of gauge and coordinate invariance. In the latter case a hamiltonian, corresponding to a quadratic lagrangian, has been constructed in a parameter-free form; and in both cases the constraints, together with the poisson bracket relations between them, have been obtained explicitly. As was to be expected, two constraints were found for a gauge-invariant theory and eight for a coordinate-invariant theory.","6":"Absolute change in general relativity","7":"Quantum Restrictions on the Measurability of Fields in Gravitational Theory","8":"Dyson has shown that the evaluation of the S matrix for quantum electrodynamics can be reduced to the problem of evaluating certain quantities, S^'F, D^'F, and \u0393_\u03bc. By making use of a formula relating the T product of an operator with its corresponding N product, integro-differential equations for S^'F and D^'F are obtained. These equations are identical in form with those given by Schwinger for his Green's functions, and hence it is concluded that the two formalisms are equivalent. In addition it is shown that all of the quantities introduced by Schwinger can be expressed in terms of a single quantity, S_vac, the vacuum expectation value of the S matrix. The renormalization problem is not discussed.","9":"In this paper we analyze the difficulties which occur when one attempts to quantize a theory such as electrodynamics or the general theory of relativity. Because of the invariance properties of theories of this type all of the canonical variables of the theory are not independent of one another but rather there exists certain algebraic relations between them called constraints. These constraints plus the Hamiltonian, in the unquantized version of the theory, constitute a function group. It is proved that there exists at least one ordering of factors in the quantized theory for which this is also true. 
From this fact we conclude that it is possible, at least formally, to construct a quantum version of the theories under consideration and that the quantum version will possess the same invariance properties as the corresponding unquantized theory."},"pubIDs":{"0":"10.1016\/0022-3697(60)90159-1","1":"10.1088\/0031-9155\/8\/3\/305","2":"10.1063\/1.1702725","3":"10.1063\/1.3051237","4":"10.1109\/TPAS.1963.291452","5":"10.1103\/PhysRev.83.1018","6":"Absolute change in general relativity","7":"Quantum Restrictions on the Measurability of Fields in Gravitational Theory","8":"10.1103\/PhysRev.94.703","9":"10.1103\/PhysRev.99.1009"}} \ No newline at end of file diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..5af3319 --- /dev/null +++ b/tox.ini @@ -0,0 +1,23 @@ +[tox] +envlist = py37,py38 +isolated_build = True + +[pytest] +minversion = 6.0 +addopts = -ra -q +testpaths = + tests + +[testenv] +deps = + pytest + -rrequirements.txt +commands_pre = python -m spacy download en_core_web_sm +commands = pytest {posargs} + +[testenv:docs] +description = invoke sphinx-build to build the HTML docs +basepython = python3.7 +deps = sphinx >= 1.7.5, < 2 +commands = sphinx-build -d "{toxworkdir}/docs_doctree" docs "{toxworkdir}/docs_out" --color -W -bhtml {posargs} + python -c 'import pathlib; print("documentation available under file://\{0\}".format(pathlib.Path(r"{toxworkdir}") / "docs_out" / "index.html"))' From bd14d0dfd07719c2b8e845c242abb283886301ee Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 29 Sep 2021 13:54:59 +0200 Subject: [PATCH 05/53] wip set readthedocs theme, add testing, fix linkage --- docs/conf.py | 2 +- src/semanticlayertools/cleaning/text.py | 5 +- src/semanticlayertools/linkage/wordscore.py | 87 +++++++++++++------- src/semanticlayertools/utils/__init__.py | 0 src/semanticlayertools/utils/wordscorenet.py | 1 + tox.ini | 4 +- 6 files changed, 65 insertions(+), 34 deletions(-) create mode 100644 src/semanticlayertools/utils/__init__.py create mode 100644 src/semanticlayertools/utils/wordscorenet.py diff --git a/docs/conf.py b/docs/conf.py index 7ce9006..8dbe953 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,7 +51,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/src/semanticlayertools/cleaning/text.py b/src/semanticlayertools/cleaning/text.py index 8d7f2cb..7f3889e 100644 --- a/src/semanticlayertools/cleaning/text.py +++ b/src/semanticlayertools/cleaning/text.py @@ -4,8 +4,7 @@ try: nlp = spacy.load("en_core_web_lg") except OSError: - pass - + nlp = spacy.load("en_core_web_sm") def lemmaSpacy(text): @@ -19,7 +18,7 @@ def lemmaSpacy(text): ) return tokens.lower() except: - return '' + raise def htmlTags(text): diff --git a/src/semanticlayertools/linkage/wordscore.py b/src/semanticlayertools/linkage/wordscore.py index e19a69e..fee1dec 100644 --- a/src/semanticlayertools/linkage/wordscore.py +++ b/src/semanticlayertools/linkage/wordscore.py @@ -18,9 +18,27 @@ class CalculateScores(): """Calculates ngram scores for documents. - Considered parts of speech are (see NLTK docs for details) + Considered parts of speech are (see `nltk` docs for details) - Nouns: 'NN', 'NNS', 'NNP', 'NNPS' - Adjectives: 'JJ', 'JJR', 'JJS' + + All texts of the corpus are tokenized and POS tags are generated. 
+ A global dictionary of counts of different ngrams is build in `allNGrams`. + The ngram relations of every text are listed in `outputDict`. + + Scoring counts occurance of different words left and right of each single + token in each ngram, weighted by ngram size. + + :param sourceDataframe: Dataframe containing the basic corpus + :type sourceDataframe: class:`pandas.DataFrame` + :param textColumn: Column name to use for ngram calculation + :type textColumn: str + :param pubIDColumn: Column name to use for publication identification (assumend to be unique) + :type pubIDColumn: str + :param yearColumn: Column name for temporal ordering publications, used during writing the scoring files + :type yearColumn: str + :param ngramsize: Maximum of considered ngrams (default: 5-gram) + :type ngramsize: int """ def __init__(self, sourceDataframe, textColumn="text", pubIDColumn="pubID", yearColumn='year', ngramsize=5,): @@ -106,37 +124,48 @@ def run(self, write=False, outpath='./'): self.outputDict.update({key: tmpList}) if write is True: for year, df in self.baseDF.groupby(self.yearCol): - with open(f'{outpath}{str(year)}.csv', 'a') as yearfile: + with open(f'{outpath}{str(year)}.tsv', 'a') as yearfile: for pub in df[self.pubIDCol].unique(): for elem in self.outputDict[pub]: - yearfile.write(f'{pub},{elem[0]},{elem[1]}') + yearfile.write(f'{pub}\t{elem[0]}\t{elem[1]}\n') return scores, self.outputDict class LinksOverTime(): - """To keep track of nodes over time, we need a global register of node names. + """Create multilayer pajek files for corpus. + + To keep track of nodes over time, we need a global register of node names. + This class takes care of this, by adding new keys of authors, papers or + ngrams to the register. - Input: + :param dataframe: Source dataframe containing metadata of texts + (authors, publicationID and year) + :type dataframe: class:`pandas.DataFrame` + :param authorColumn: Column name for author information + :param pubIDColumn: Column name to identify publications + :param yearColumn: Column name with year information """ - def __init__(self, outputPath, scorePath, dataframe, authorColumn='authors', pubIDColumn="pubID", yearColumn='year', scoreLimit=1.0, debug=False, windowSize=1): + def __init__( + self, + dataframe, + authorColumn='authors', + pubIDColumn="pubID", + yearColumn='year', + debug=False + ): self.dataframe = dataframe self.authorCol = authorColumn self.pubIDCol = pubIDColumn self.yearColumn = yearColumn - self.scoreLimit = scoreLimit - self.outpath = outputPath - self.scorepath = scorePath self.nodeMap = {} self.debug = debug - self.windowSize = windowSize - def _window(self, seq): + def _window(self, seq, n): """Return a sliding window (of width n) over data from the iterable. s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... 
""" - n = self.windowSize it = iter(seq) result = tuple(islice(it, n)) if len(result) == n: @@ -145,27 +174,27 @@ def _window(self, seq): result = result[1:] + (elem,) yield result - def _createSlices(self): + def _createSlices(self, windowsize): slices = [] years = sorted(self.dataframe[self.yearColumn].unique()) - for x in self._window(years): + for x in self._window(years, windowsize): slices.append(x) return slices - def createNodeRegister(self, sl): + def createNodeRegister(self, sl, scorePath, scoreLimit): """Create multilayer node register for time slice.""" if self.debug is True: print(f'Slice: {sl[0]}') dataframe = self.dataframe[self.dataframe[self.yearColumn].isin(sl)] dfNgramsList = [pd.read_csv( - self.scorepath + str(slN) + '.tsv', + scorePath + str(slN) + '.tsv', sep='\t', header=None ) for slN in sl] ngramdataframe = pd.concat(dfNgramsList) - ngramdataframe = ngramdataframe[ngramdataframe[2] > self.scoreLimit] + ngramdataframe = ngramdataframe[ngramdataframe[2] > scoreLimit] - authorList = [x for y in dataframe[self.authorCol].values for x in y] + authorList = [x for y in [x.split(';') for x in dataframe[self.authorCol].values] for x in y] authors = [x for x in set(authorList) if x] pubs = dataframe[self.pubIDCol].fillna('None').unique() @@ -193,10 +222,10 @@ def createNodeRegister(self, sl): ) ) - def writeLinks(self, sl, recreate=False): - """Write links to file.""" + def writeLinks(self, sl, scorePath, scoreLimit, outpath='./', recreate=False): + """Write multilayer links to file in Pajek format.""" dataframe = self.dataframe[self.dataframe[self.yearColumn].isin(sl)] - filePath = self.outpath + 'multilayerPajek_{0}.net'.format(sl[0]) + filePath = outpath + 'multilayerPajek_{0}.net'.format(sl[0]) if os.path.isfile(filePath): if recreate is False: @@ -207,12 +236,12 @@ def writeLinks(self, sl, recreate=False): os.remove(filePath) dfNgramsList = [pd.read_csv( - self.scorepath + str(slN) + '.tsv', + scorePath + str(slN) + '.tsv', sep='\t', header=None ) for slN in sl] ngramdataframe = pd.concat(dfNgramsList) - ngramdataframe = ngramdataframe[ngramdataframe[2] > self.scoreLimit] + ngramdataframe = ngramdataframe[ngramdataframe[2] > scoreLimit] with open(filePath, 'a') as file: file.write("# A network in a general multiplex format\n") @@ -226,7 +255,7 @@ def writeLinks(self, sl, recreate=False): if self.debug is True: print('\tWriting inter-layer links to file.') for _, row in dataframe.fillna('').iterrows(): - authors = row[self.authorCol] + authors = row[self.authorCol].split(';') paper = row[self.pubIDCol] if paper not in self.nodeMap.keys(): print(f'Cannot find {paper}') @@ -269,8 +298,8 @@ def writeLinks(self, sl, recreate=False): except KeyError: pass - def run(self, recreate=False): - """Create all data for slices.""" - for sl in tqdm(self._createSlices()): - self.createNodeRegister(sl) - self.writeLinks(sl, recreate=recreate) + def run(self, recreate=False, windowsize=1, scorePath='./', outPath='./', scoreLimit=1.0): + """Create data for all slices.""" + for sl in tqdm(self._createSlices(windowsize)): + self.createNodeRegister(sl, scorePath, scoreLimit) + self.writeLinks(sl, scorePath, scoreLimit, outpath=outPath, recreate=recreate) diff --git a/src/semanticlayertools/utils/__init__.py b/src/semanticlayertools/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semanticlayertools/utils/wordscorenet.py b/src/semanticlayertools/utils/wordscorenet.py new file mode 100644 index 0000000..41c2735 --- /dev/null +++ 
b/src/semanticlayertools/utils/wordscorenet.py @@ -0,0 +1 @@ +"""Runs all steps to create a multilayer network.""" diff --git a/tox.ini b/tox.ini index 5af3319..ea55a51 100644 --- a/tox.ini +++ b/tox.ini @@ -18,6 +18,8 @@ commands = pytest {posargs} [testenv:docs] description = invoke sphinx-build to build the HTML docs basepython = python3.7 -deps = sphinx >= 1.7.5, < 2 +deps = + sphinx >= 1.7.5, < 2 + sphinx_rtd_theme commands = sphinx-build -d "{toxworkdir}/docs_doctree" docs "{toxworkdir}/docs_out" --color -W -bhtml {posargs} python -c 'import pathlib; print("documentation available under file://\{0\}".format(pathlib.Path(r"{toxworkdir}") / "docs_out" / "index.html"))' From 1cd58bc084dc198e85ca24e90b21431560d5c0f8 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 30 Sep 2021 13:52:02 +0200 Subject: [PATCH 06/53] add clustering and util fct --- setup.cfg | 1 + src/semanticlayertools/clustering/__init__.py | 0 src/semanticlayertools/clustering/infomap.py | 45 +++++++++++++ src/semanticlayertools/linkage/wordscore.py | 13 +++- src/semanticlayertools/utils/wordscorenet.py | 67 +++++++++++++++++++ 5 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 src/semanticlayertools/clustering/__init__.py create mode 100644 src/semanticlayertools/clustering/infomap.py diff --git a/setup.cfg b/setup.cfg index 4f861e3..80ad574 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,6 +26,7 @@ install_requires = numpy spacy pandas + infomap [options.packages.find] where = src diff --git a/src/semanticlayertools/clustering/__init__.py b/src/semanticlayertools/clustering/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semanticlayertools/clustering/infomap.py b/src/semanticlayertools/clustering/infomap.py new file mode 100644 index 0000000..375945e --- /dev/null +++ b/src/semanticlayertools/clustering/infomap.py @@ -0,0 +1,45 @@ +import os +from tqdm import tqdm +import infomap + + +class Clustering(): + """Cluster using infomap.""" + + def __init__( + self, + infomapSettings="-N5 -imultilayer -fundirected" + ): + self.infomult = infomap.Infomap(infomapSettings) + + def calcInfomap(self, inFilePath, outPath, recreate=False, debug=False): + """Calc clusters for one pajekt file.""" + year = inFilePath.split(os.path.sep)[-1].split('_')[1].split('.')[0] + cluFilePath = f'{outPath}slice_{year}.clu' + ftreeFilePath = f'{outPath}slice_{year}.ftree' + if os.path.isfile(cluFilePath) or os.path.isfile(ftreeFilePath): + if recreate is False: + raise IOError( + f'Files at {cluFilePath} or {ftreeFilePath} exists. Set recreate = True to rewrite files.' 
+ ) + if recreate is True: + os.remove(cluFilePath) + os.remove(ftreeFilePath) + self.infomult.readInputData(inFilePath) + self.infomult.run() + self.infomult.writeClu(cluFilePath) + self.infomult.writeFlowTree(ftreeFilePath) + if debug: + print( + f"Clustered in {self.infomult.maxTreeDepth()} levels with codelength {self.infomult.codelength}" + ) + print("\tDone: Slice {0}!".format(year)) + return + + def run(self, pajekPath='./', outPath='./', recreate=False, debug=False): + """Calculate infomap clustering for all pajek files in path.""" + pajekFiles = sorted( + [pajekPath + x for x in os.listdir(pajekPath) if x.endswith('.net')] + ) + for file in tqdm(pajekFiles): + self.calcInfomap(inFilePath=file, outPath=outPath, debug=debug) diff --git a/src/semanticlayertools/linkage/wordscore.py b/src/semanticlayertools/linkage/wordscore.py index fee1dec..222d7bd 100644 --- a/src/semanticlayertools/linkage/wordscore.py +++ b/src/semanticlayertools/linkage/wordscore.py @@ -111,7 +111,7 @@ def getScore(self, target): target: 1/meta["counts"] * (np.prod(valueList)) ** (1 / (2.0 * meta["maxL"])) } - def run(self, write=False, outpath='./'): + def run(self, write=False, outpath='./', recreate=False): """Get score for all documents.""" scores = {} self.getTermPatterns() @@ -124,7 +124,15 @@ def run(self, write=False, outpath='./'): self.outputDict.update({key: tmpList}) if write is True: for year, df in self.baseDF.groupby(self.yearCol): - with open(f'{outpath}{str(year)}.tsv', 'a') as yearfile: + filePath = f'{outpath}{str(year)}.tsv' + if os.path.isfile(filePath): + if recreate is False: + raise IOError( + f'File at {filePath} exists. Set recreate = True to rewrite file.' + ) + if recreate is True: + os.remove(filePath) + with open(filePath, 'a') as yearfile: for pub in df[self.pubIDCol].unique(): for elem in self.outputDict[pub]: yearfile.write(f'{pub}\t{elem[0]}\t{elem[1]}\n') @@ -195,7 +203,6 @@ def createNodeRegister(self, sl, scorePath, scoreLimit): ngramdataframe = ngramdataframe[ngramdataframe[2] > scoreLimit] authorList = [x for y in [x.split(';') for x in dataframe[self.authorCol].values] for x in y] - authors = [x for x in set(authorList) if x] pubs = dataframe[self.pubIDCol].fillna('None').unique() ngrams = ngramdataframe[1].unique() diff --git a/src/semanticlayertools/utils/wordscorenet.py b/src/semanticlayertools/utils/wordscorenet.py index 41c2735..56c00d1 100644 --- a/src/semanticlayertools/utils/wordscorenet.py +++ b/src/semanticlayertools/utils/wordscorenet.py @@ -1 +1,68 @@ """Runs all steps to create a multilayer network.""" +import tempfile +from datetime import datetime +import os + +from ..cleaning.text import htmlTags, lemmaSpacy +from ..linkage.wordscore import CalculateScores, LinksOverTime +from ..clustering.infomap import Clustering + + +def run( + dataframe, + tempFiles=True, + outPath='./', + textColumn='text', + authorColumn='author', + pubIDColumn='publicationID', + scoreLimit=1.0 +): + """Run all steps for multilayer network generation using wordscoring.""" + clean = dataframe[textColumn].apply(lambda row: lemmaSpacy(htmlTags(row))) + + dataframe.insert(0, 'clean', clean) + + score = CalculateScores( + dataframe, + textColumn='clean', + pubIDColumn=pubIDColumn + ) + links = LinksOverTime( + dataframe, + authorColumn=authorColumn, + pubIDColumn=pubIDColumn + ) + clusters = Clustering() + if tempFiles is True: + with tempfile.TemporaryDirectory() as tmpdirname: + sc, outDict = score.run( + write=True, outpath=f'{tmpdirname}/scores/', recreate=True + ) + links.run( + 
recreate=True, + scorePath=f'{tmpdirname}/scores/', + outPath=f'{tmpdirname}/links/', + scoreLimit=scoreLimit + ) + clusters.run( + pajekPath=f'{tmpdirname}/links/', + outPath=outPath, + ) + else: + timestamp = datetime.now().strftime("_%Y_%m_%d") + basedir = outPath + timestamp + for subdir in ['scores', 'links', 'clusters']: + os.makedirs(basedir + subdir) + sc, outDict = score.run( + write=True, outpath=f'{basedir}/scores/', recreate=True + ) + links.run( + recreate=True, + scorePath=f'{basedir}/scores/', + outPath=f'{basedir}/links/', + scoreLimit=scoreLimit + ) + clusters.run( + pajekPath=f'{basedir}/links/', + outPath=f'{basedir}/clusters', + ) From d478b6abb06afcd9dfdef9164fd2421519d50a8b Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 30 Sep 2021 17:37:46 +0200 Subject: [PATCH 07/53] wip update utils to run pipeline --- src/semanticlayertools/clustering/infomap.py | 2 +- src/semanticlayertools/utils/wordscorenet.py | 45 +++++++------------- 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/src/semanticlayertools/clustering/infomap.py b/src/semanticlayertools/clustering/infomap.py index 375945e..f566d68 100644 --- a/src/semanticlayertools/clustering/infomap.py +++ b/src/semanticlayertools/clustering/infomap.py @@ -8,7 +8,7 @@ class Clustering(): def __init__( self, - infomapSettings="-N5 -imultilayer -fundirected" + infomapSettings="-N5 -imultilayer -fundirected --silent" ): self.infomult = infomap.Infomap(infomapSettings) diff --git a/src/semanticlayertools/utils/wordscorenet.py b/src/semanticlayertools/utils/wordscorenet.py index 56c00d1..050495b 100644 --- a/src/semanticlayertools/utils/wordscorenet.py +++ b/src/semanticlayertools/utils/wordscorenet.py @@ -34,35 +34,22 @@ def run( ) clusters = Clustering() if tempFiles is True: - with tempfile.TemporaryDirectory() as tmpdirname: - sc, outDict = score.run( - write=True, outpath=f'{tmpdirname}/scores/', recreate=True - ) - links.run( - recreate=True, - scorePath=f'{tmpdirname}/scores/', - outPath=f'{tmpdirname}/links/', - scoreLimit=scoreLimit - ) - clusters.run( - pajekPath=f'{tmpdirname}/links/', - outPath=outPath, - ) + basedir = tempfile.TemporaryDirectory().name else: timestamp = datetime.now().strftime("_%Y_%m_%d") basedir = outPath + timestamp - for subdir in ['scores', 'links', 'clusters']: - os.makedirs(basedir + subdir) - sc, outDict = score.run( - write=True, outpath=f'{basedir}/scores/', recreate=True - ) - links.run( - recreate=True, - scorePath=f'{basedir}/scores/', - outPath=f'{basedir}/links/', - scoreLimit=scoreLimit - ) - clusters.run( - pajekPath=f'{basedir}/links/', - outPath=f'{basedir}/clusters', - ) + for subdir in ['scores', 'links', 'clusters']: + os.makedirs(os.path.join(basedir, subdir)) + sc, outDict = score.run( + write=True, outpath=f'{basedir}/scores/', recreate=True + ) + links.run( + recreate=True, + scorePath=f'{basedir}/scores/', + outPath=f'{basedir}/links/', + scoreLimit=scoreLimit + ) + clusters.run( + pajekPath=f'{basedir}/links/', + outPath=f'{outPath}', + ) From f1c4c41e4346ec1c7516d4071948ed5b5a01794b Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Fri, 1 Oct 2021 13:52:30 +0200 Subject: [PATCH 08/53] wip updt origin, data testing complete, no tests written for clustering yet --- src/semanticlayertools/utils/wordscorenet.py | 30 +++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/semanticlayertools/utils/wordscorenet.py b/src/semanticlayertools/utils/wordscorenet.py index 050495b..2aae469 100644 --- 
a/src/semanticlayertools/utils/wordscorenet.py +++ b/src/semanticlayertools/utils/wordscorenet.py @@ -13,6 +13,7 @@ def run( tempFiles=True, outPath='./', textColumn='text', + yearColumn='year', authorColumn='author', pubIDColumn='publicationID', scoreLimit=1.0 @@ -30,17 +31,20 @@ def run( links = LinksOverTime( dataframe, authorColumn=authorColumn, - pubIDColumn=pubIDColumn + pubIDColumn=pubIDColumn, + yearColumn=yearColumn ) clusters = Clustering() if tempFiles is True: basedir = tempfile.TemporaryDirectory().name + clusterout = outPath else: timestamp = datetime.now().strftime("_%Y_%m_%d") - basedir = outPath + timestamp + basedir = outPath + 'Clustering' + timestamp + clusterout = f'{basedir}/clusters/' for subdir in ['scores', 'links', 'clusters']: os.makedirs(os.path.join(basedir, subdir)) - sc, outDict = score.run( + score.run( write=True, outpath=f'{basedir}/scores/', recreate=True ) links.run( @@ -51,5 +55,23 @@ def run( ) clusters.run( pajekPath=f'{basedir}/links/', - outPath=f'{outPath}', + outPath=clusterout, ) + with open(f'{basedir}/README.txt', 'w+') as file: + file.write( + f"""Run of clustering {datetime.now().strftime("%Y_%m_%d")} + + Text cleaned in column: {textColumn} (html tags removed and lemmatized) + Authors information from column: {authorColumn} + Unique publication IDs from columns: {pubIDColumn} + Ngram scores greater {scoreLimit} were considered for link creation. + Clustering result in folder: {clusterout} + """ + ) + if tempFiles is True: + file.write( + 'Temporay files for wordscores and multilayer network were deleted.' + ) + print(f"""Results in {clusterout}.\n + Head over to https://www.mapequation.org/alluvial/ to visualize the ftree files. + """) From 1ba08e4dd32f4fffd9e93b2a89baa5b29f19dbb5 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 4 Oct 2021 16:31:57 +0200 Subject: [PATCH 09/53] fix multiprocessing --- src/semanticlayertools/linkage/wordscore.py | 75 +++++++++++--------- src/semanticlayertools/utils/wordscorenet.py | 36 +++++++--- 2 files changed, 65 insertions(+), 46 deletions(-) diff --git a/src/semanticlayertools/linkage/wordscore.py b/src/semanticlayertools/linkage/wordscore.py index 222d7bd..84bdaa7 100644 --- a/src/semanticlayertools/linkage/wordscore.py +++ b/src/semanticlayertools/linkage/wordscore.py @@ -2,7 +2,7 @@ import re from collections import Counter, defaultdict from itertools import islice, combinations - +from multiprocessing import Pool, cpu_count from tqdm import tqdm import numpy as np import pandas as pd @@ -41,7 +41,15 @@ class CalculateScores(): :type ngramsize: int """ - def __init__(self, sourceDataframe, textColumn="text", pubIDColumn="pubID", yearColumn='year', ngramsize=5,): + def __init__( + self, + sourceDataframe, + textColumn="text", + pubIDColumn="pubID", + yearColumn='year', + ngramsize=5, + debug=False + ): self.baseDF = sourceDataframe self.textCol = textColumn @@ -51,8 +59,9 @@ def __init__(self, sourceDataframe, textColumn="text", pubIDColumn="pubID", year self.outputDict = {} self.allNGrams = [] self.counts = {} - self.allgramslist = [] + self.corpussize = 1 self.uniqueNGrams = () + self.debug=debug def getTermPatterns(self): """Create dictionaries of occuring ngrams.""" @@ -72,51 +81,47 @@ def getTermPatterns(self): self.outputDict[row[self.pubIDCol]] = tempNGram self.allNGrams = allNGrams allgrams = [x for y in [y for x, y in self.allNGrams.items()] for x in y] - self.allgramslist = allgrams + self.corpussize = len(allgrams) self.counts = Counter(allgrams) self.uniqueNGrams = set(allgrams) def 
getScore(self, target): """Calculate ngram score.""" - meta = { - "target": target, - "counts": self.counts[target], - "corpusL": len(self.allgramslist), - "maxL": len(target), - } - - res = defaultdict(list) - - for idx, subgram in enumerate(target): - key = idx + 1 - for tup in self.allNGrams[2]: - if tup[1:][0] == subgram: - res[f"l_{key}"].append(tup[:1][0]) - elif tup[:-1][0] == subgram: - res[f"r_{key}"].append(tup[1:][0]) valueList = [] - for L in range(1, meta["maxL"] + 1, 1): - leftkey = f"l_{L}" - rightkey = f"r_{L}" - if rightkey not in res.keys(): - rvalue = 0 - else: - rvalue = len(list(set(res[rightkey]))) - if leftkey not in res.keys(): - lvalue = 0 - else: - lvalue = len(list(set(res[leftkey]))) + for _, subgram in enumerate(target): + contains = [x for x in self.allNGrams[2] if subgram in x] + rvalue = len(set(x for x in contains if x[0] == subgram)) + lvalue = len(set(x for x in contains if x[1] == subgram)) valueList.append((lvalue + 1) * (rvalue + 1)) return { - target: 1/meta["counts"] * (np.prod(valueList)) ** (1 / (2.0 * meta["maxL"])) + target: 1/self.counts[target] * (np.prod(valueList)) ** (1 / (2.0 * len(target))) } - def run(self, write=False, outpath='./', recreate=False): + def _calcBatch(self, batch): + res = [] + for elem in tqdm(batch): + res.append(self.getScore(elem)) + return res + + def run(self, write=False, outpath='./', recreate=False, limitCPUs=True): """Get score for all documents.""" scores = {} self.getTermPatterns() - for target in tqdm(self.uniqueNGrams): - scores.update(self.getScore(target)) + if self.debug is True: + print(f'Found {len(self.uniqueNGrams)} unique {self.ngramEnd}-grams.') + if limitCPUs is True: + ncores = int(cpu_count()*1/4) + else: + ncores = cpu_count() - 2 + pool = Pool(ncores) + chunk_size = int(len(self.uniqueNGrams)/ncores) + batches = [ + list(self.uniqueNGrams)[i:i+chunk_size] for i in range(0, len(self.uniqueNGrams), chunk_size) + ] + ncoresResults = pool.map(self._calcBatch, batches) + results = [x for y in ncoresResults for x in y] + for elem in results: + scores.update(elem) for key, val in self.outputDict.items(): tmpList = [] for elem in val: diff --git a/src/semanticlayertools/utils/wordscorenet.py b/src/semanticlayertools/utils/wordscorenet.py index 2aae469..5996fda 100644 --- a/src/semanticlayertools/utils/wordscorenet.py +++ b/src/semanticlayertools/utils/wordscorenet.py @@ -16,17 +16,33 @@ def run( yearColumn='year', authorColumn='author', pubIDColumn='publicationID', + ngramsize=5, scoreLimit=1.0 ): """Run all steps for multilayer network generation using wordscoring.""" + + if tempFiles is True: + basedir = tempfile.TemporaryDirectory().name + clusterout = outPath + else: + timestamp = datetime.now().strftime("_%Y_%m_%d") + basedir = outPath + 'Clustering' + timestamp + clusterout = f'{basedir}/clusters/' + for subdir in ['scores', 'links', 'clusters']: + os.makedirs(os.path.join(basedir, subdir)) + print(f'Start cleaning {textColumn} column.') clean = dataframe[textColumn].apply(lambda row: lemmaSpacy(htmlTags(row))) dataframe.insert(0, 'clean', clean) + if tempFiles is False: + dataframe.to_json(f'{basedir}/sourceDFcleaned.json', orient='records', lines=True) + print('\tDone.') score = CalculateScores( dataframe, textColumn='clean', - pubIDColumn=pubIDColumn + pubIDColumn=pubIDColumn, + ngramsize=ngramsize ) links = LinksOverTime( dataframe, @@ -35,28 +51,26 @@ def run( yearColumn=yearColumn ) clusters = Clustering() - if tempFiles is True: - basedir = tempfile.TemporaryDirectory().name - clusterout = 
outPath - else: - timestamp = datetime.now().strftime("_%Y_%m_%d") - basedir = outPath + 'Clustering' + timestamp - clusterout = f'{basedir}/clusters/' - for subdir in ['scores', 'links', 'clusters']: - os.makedirs(os.path.join(basedir, subdir)) + + print(f'Start calculating scores for {dataframe.shape[0]} texts.') score.run( write=True, outpath=f'{basedir}/scores/', recreate=True ) + print('\tDone.') + print(f'Start creating links with scoreLimit > {scoreLimit}.') links.run( recreate=True, scorePath=f'{basedir}/scores/', outPath=f'{basedir}/links/', scoreLimit=scoreLimit ) + print('\tDone.') + print('Start calculating infomap clusters.') clusters.run( pajekPath=f'{basedir}/links/', outPath=clusterout, ) + print('\tDone.') with open(f'{basedir}/README.txt', 'w+') as file: file.write( f"""Run of clustering {datetime.now().strftime("%Y_%m_%d")} @@ -70,7 +84,7 @@ def run( ) if tempFiles is True: file.write( - 'Temporay files for wordscores and multilayer network were deleted.' + 'Temporay files for wordscores and multilayer networks were deleted.' ) print(f"""Results in {clusterout}.\n Head over to https://www.mapequation.org/alluvial/ to visualize the ftree files. From 7c983f8470acbb84a828751923c9398e82d35c1f Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 9 Dec 2021 18:13:59 +0100 Subject: [PATCH 10/53] add routine for cocitations --- src/semanticlayertools/linkage/cocitation.py | 66 ++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 src/semanticlayertools/linkage/cocitation.py diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py new file mode 100644 index 0000000..12b5056 --- /dev/null +++ b/src/semanticlayertools/linkage/cocitation.py @@ -0,0 +1,66 @@ +"""Link documents by cocitation.""" +import os +import time +import tempfile +import multiprocessing +from itertools import combinations +from collections import Counter + +import pandas as pd +import numpy as np + +num_processes = multiprocessing.cpu_count() + + +class Cocitations(): + """Cocitation calculations.""" + + def __init__( + self, inpath, outpath, columnName, numberProc=num_processes, debug=False + ): + self.inpath = inpath + self.outpath = outpath + self.columnName = columnName + self.numberProc = numberProc + self.debug = debug + + def getCombinations(self, chunk): + """Calculate combinations.""" + res = [] + for idx, row in chunk.iterrows(): + comb = combinations(row[self.columnName], 2) + for elem in list(comb): + res.append((elem)) + return res + + def calculateCoCitation(self, filepath): + """Do calculation for input file.""" + infilename = filepath.split(os.path.sep)[-1].split('.')[0] + starttime = time.time() + try: + data = pd.read_json(filepath, lines=True).dropna(subset=[self.columnName]) + chunk_size = int(data.shape[0] / self.numberProc) + chunks = np.array_split(data, chunk_size) + pool = multiprocessing.Pool(processes=self.numberProc) + cocitations = pool.map(self.getCombinations, chunks) + cocitCounts = Counter([x for y in cocitations for x in y]) + sortCoCitCounts = cocitCounts.most_common() + with open(self.outpath + infilename + '.csv', 'w') as outfile: + for edge in sortCoCitCounts: + outfile.write(f"{edge[0][0]},{edge[0][1]},{edge[1]}\n") + except: + raise + if self.debug == "l2": + print(f'\tDone in {starttime - time.time()} seconds.') + return + + def processFolder(self): + """Calculate cocitation for all files in folder.""" + starttime = time.time() + for file in os.listdir(self.inpath): + try: + 
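            # Illustrative sketch, not part of the committed diff: calculateCoCitation,
            # defined above, counts for every document all unordered pairs of its cited
            # references via itertools.combinations, e.g.
            #     list(combinations(['refA', 'refB', 'refC'], 2))
            #     # -> [('refA', 'refB'), ('refA', 'refC'), ('refB', 'refC')]
            # collections.Counter then turns pairs repeated across documents into weighted
            # cocitation edges, which the call below writes to one CSV per input file.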
self.calculateCoCitation(os.path.join(self.inpath, file)) + except: + raise + if self.debug is True: + print(f'\tDone in {starttime - time.time()} seconds.') From baea9a12909570ccc4a675fb06527f2442776aa9 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Fri, 10 Dec 2021 13:46:59 +0100 Subject: [PATCH 11/53] add giant component writing --- src/semanticlayertools/linkage/cocitation.py | 40 ++++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index 12b5056..58ffa9f 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -1,33 +1,45 @@ """Link documents by cocitation.""" import os import time -import tempfile import multiprocessing from itertools import combinations from collections import Counter +from typing import TypeVar +import igraph as ig import pandas as pd import numpy as np num_processes = multiprocessing.cpu_count() +limitRefLength = TypeVar('limitRefLength', bool, int) +debugVar = TypeVar('debugVar', bool, str) class Cocitations(): """Cocitation calculations.""" def __init__( - self, inpath, outpath, columnName, numberProc=num_processes, debug=False + self, inpath, outpath, columnName, + numberProc: int=num_processes, limitRefLength: limitRefLength=False, debug: debugVar=False, ): self.inpath = inpath self.outpath = outpath self.columnName = columnName self.numberProc = numberProc + self.limitRefLength = limitRefLength self.debug = debug def getCombinations(self, chunk): """Calculate combinations.""" res = [] - for idx, row in chunk.iterrows(): + if type(self.limitRefLength) == int: + reflen = chunk[self.columnName].apply( + lambda x: True if type(x)==list and len(x)<=self.limitRefLength else False + ) + data = chunk[reflen].copy() + else: + data = chunk.copy() + for idx, row in data.iterrows(): comb = combinations(row[self.columnName], 2) for elem in list(comb): res.append((elem)) @@ -44,14 +56,26 @@ def calculateCoCitation(self, filepath): pool = multiprocessing.Pool(processes=self.numberProc) cocitations = pool.map(self.getCombinations, chunks) cocitCounts = Counter([x for y in cocitations for x in y]) - sortCoCitCounts = cocitCounts.most_common() - with open(self.outpath + infilename + '.csv', 'w') as outfile: + sortCoCitCounts = [ + (x[0][0], x[0][1], x[1]) for x in cocitCounts.most_common() + ] + tempG = ig.Graph.TupleList(sortCoCitCounts, weights=True, vertex_name_attr='id') + components = tempG.components() + sortedComponents = sorted( + [(x, len(x)) for x in components], key=lambda x: x[1], reverse=True + ) + giantComponent = sortedComponents[0] + giantComponentGraph = tempG.vs.select(giantComponent[0]).subgraph() + giantComponentGraph.write_pajek( + os.path.join(self.outpath,infilename + '_GC.net') + ) + with open(os.path.join(self.outpath,infilename + '.ncol'), 'w') as outfile: for edge in sortCoCitCounts: - outfile.write(f"{edge[0][0]},{edge[0][1]},{edge[1]}\n") + outfile.write(f"{edge[0]} {edge[1]} {edge[2]}\n") except: raise if self.debug == "l2": - print(f'\tDone in {starttime - time.time()} seconds.') + print(f'\tDone in {time.time() - starttime} seconds.') return def processFolder(self): @@ -63,4 +87,4 @@ def processFolder(self): except: raise if self.debug is True: - print(f'\tDone in {starttime - time.time()} seconds.') + print(f'\tDone in {time.time() - starttime} seconds.') From 532e56ef21b58239928e6c7bde7afe0cd4356a07 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 15 Dec 2021 
07:15:12 +0100 Subject: [PATCH 12/53] upd orig --- src/semanticlayertools/linkage/cocitation.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index 58ffa9f..a63e967 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -9,6 +9,7 @@ import igraph as ig import pandas as pd import numpy as np +from tqdm import tqdm num_processes = multiprocessing.cpu_count() @@ -62,8 +63,16 @@ def calculateCoCitation(self, filepath): tempG = ig.Graph.TupleList(sortCoCitCounts, weights=True, vertex_name_attr='id') components = tempG.components() sortedComponents = sorted( - [(x, len(x)) for x in components], key=lambda x: x[1], reverse=True + [(x, len(x), len(x)*100/len(tempG.vs)) for x in components], key=lambda x: x[1], reverse=True ) + with open(os.path.join(self.outpath,infilename + '_graphMetadata.txt'), 'w') as outfile: + outfile.write(f'Graph derived from {filepath}\nSummary:\n') + outfile.write(tempG.summary() + '\n\nComponents (ordered by size):\n\n') + for idx, elem in enumerate(sortedComponents): + gcompTemp = tempG.vs.select(elem[0]).subgraph() + outfile.write( + f"{idx}:\n\t{elem[1]} nodes ({elem[2]:.3f}% of full graph)\n\t{len(gcompTemp.es)} edges ({len(gcompTemp.es)*100/len(tempG.es):.3f}% of full graph)\n\n" + ) giantComponent = sortedComponents[0] giantComponentGraph = tempG.vs.select(giantComponent[0]).subgraph() giantComponentGraph.write_pajek( @@ -81,7 +90,7 @@ def calculateCoCitation(self, filepath): def processFolder(self): """Calculate cocitation for all files in folder.""" starttime = time.time() - for file in os.listdir(self.inpath): + for file in tqdm(os.listdir(self.inpath)): try: self.calculateCoCitation(os.path.join(self.inpath, file)) except: From c885446ef4dbb740ffb08de1c1f8444e662a133c Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 15 Dec 2021 15:23:00 +0100 Subject: [PATCH 13/53] add leiden time clusters and streamgraph visuals, WIP --- setup.cfg | 5 +- src/semanticlayertools/clustering/leiden.py | 118 +++++++++++++++++++ src/semanticlayertools/linkage/cocitation.py | 2 +- src/semanticlayertools/visual/__init__.py | 0 src/semanticlayertools/visual/utils.py | 38 ++++++ 5 files changed, 161 insertions(+), 2 deletions(-) create mode 100644 src/semanticlayertools/clustering/leiden.py create mode 100644 src/semanticlayertools/visual/__init__.py create mode 100644 src/semanticlayertools/visual/utils.py diff --git a/setup.cfg b/setup.cfg index 80ad574..f990da3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = semanticlayertools -version = 0.0.1 +version = 0.0.3 author = Malte Vogl author_email = mvogl@mpiwg-berlin.mpg.de description = Create semantic layers using different methods for word linking. 
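Aside: the cocitation stage committed above (PATCH 10-12) leaves three artefacts per year slice -- a weighted edge list (<year>.ncol), a Pajek file of the giant component (<year>_GC.net) and a plain-text graph summary. A minimal sketch of inspecting one such edge list with igraph; the path and year are illustrative assumptions, the igraph calls mirror those used in calculateCoCitation:

    import igraph as ig

    # Illustrative path: any slice written by Cocitations.processFolder() works.
    # Edge weights are read automatically from the third NCOL column.
    graph = ig.Graph.Read_Ncol('cocite/1960.ncol', directed=False)
    components = graph.components()
    giant = sorted(components, key=len, reverse=True)[0]   # vertex ids of the giant component
    giant_graph = graph.subgraph(giant)
    print(graph.summary())
    print(f'{giant_graph.vcount()} of {graph.vcount()} nodes are in the giant component.')
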
@@ -22,11 +22,14 @@ packages = find: python_requires = >=3.7 install_requires = tqdm + matplotlib nltk numpy spacy pandas infomap + igraph + leidenalg [options.packages.find] where = src diff --git a/src/semanticlayertools/clustering/leiden.py b/src/semanticlayertools/clustering/leiden.py new file mode 100644 index 0000000..31b43ea --- /dev/null +++ b/src/semanticlayertools/clustering/leiden.py @@ -0,0 +1,118 @@ +import os +import time +import re +from typing import TypeVar + +from tqdm import tqdm + +import igraph as ig +import leidenalg as la + +debugVar = TypeVar('debugVar', bool, str) + + +class TimeCluster(): + """Cluster time-sliced data with the Leiden algorithm.""" + + def __init__( + self, inpath: str, outpath: str, + resolution: float = 0.003, intersliceCoupling: float = 0.4, + timerange: tuple = (1945, 2005), + debug: debugVar = False + ): + starttime = time.time() + self.inpath = inpath + self.outpath = outpath + self.res_param = resolution + self.interslice_param = intersliceCoupling + self.timerange = timerange + self.debug = debug + + self.outfile = os.path.join( + outpath, + f'timeclusters_{timerange[0]}-{timerange[1]}_res_{resolution}_intersl_{intersliceCoupling}.csv' + ) + if os.path.isfile(self.outfile): + raise OSError('Output file exists. Please remove.') + + edgefiles = [x for x in os.listdir(inpath) if x.endswith('_GC.net')] + + self.graphDict = {} + + for idx in tqdm(range(len(edgefiles))): + try: + year = re.findall(r'\d{4}', edgefiles[idx])[0] + except: + raise + if timerange[0] <= int(year) <= timerange[1]: + graph = ig.Graph.Read_Pajek(os.path.join(inpath, edgefiles[idx])) + self.graphDict[year] = graph + + self.optimiser = la.Optimiser() + + print( + "Graphs between " + f"{min(list(self.graphDict.keys()))} and " + f"{max(list(self.graphDict.keys()))} " + f"loaded in {time.time() - starttime} seconds." + ) + + def optimize(self): + """Optimize clusters accross time slices.""" + starttime = time.time() + + layers, interslice_layer, _ = la.time_slices_to_layers( + list(self.graphDict.values()), + interslice_weight=self.interslice_param, + vertex_id_attr='name' + ) + print('\tHave set layers.') + + partitions = [ + la.CPMVertexPartition( + H, + node_sizes='node_size', + weights='weight', + resolution_parameter=self.res_param + ) for H in layers + ] + print('\tHave set partitions.') + + interslice_partition = la.CPMVertexPartition( + interslice_layer, + resolution_parameter=0, + node_sizes='node_size', + weights='weight' + ) + print('\tHave set interslice partions.') + + self.optimiser.optimise_partition_multiplex( + partitions + [interslice_partition] + ) + + subgraphs = interslice_partition.subgraphs() + + commun = [] + for idx, part in enumerate(subgraphs): + nodevals = [ + ( + x['name'], + list(self.graphDict.keys()).pop(x['slice']), + idx + ) for x in part.vs + ] + commun.extend(nodevals) + + with open(self.outfile, 'w') as outfile: + outfile.write('node,year,cluster\n') + for elem in commun: + outfile.write( + f"{elem[0]},{elem[1]},{elem[2]}\n" + ) + + print( + f'Finished in {time.time() - starttime} seconds.' + f"Found {len(subgraphs)} clusters." 
+ ) + + return commun diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index a63e967..a78a745 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -17,7 +17,7 @@ debugVar = TypeVar('debugVar', bool, str) class Cocitations(): - """Cocitation calculations.""" + """Create cocitation networks.""" def __init__( self, inpath, outpath, columnName, diff --git a/src/semanticlayertools/visual/__init__.py b/src/semanticlayertools/visual/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py new file mode 100644 index 0000000..a2eff8d --- /dev/null +++ b/src/semanticlayertools/visual/utils.py @@ -0,0 +1,38 @@ +import matplotlib.pyplot as plt +import pandas as pd + + +def streamgraph(filepath): + """Plot streamgraph of cluster sizes vs years.""" + basedf = pd.read_csv(filepath) + basedata = basedf.groupby(['year', 'cluster']).size().to_frame('counts').reset_index() + yearbase = [ + str(x) for x in range( + int(basedata.year.min()), int(basedata.year.max()) + 1 + ) + ] + cluDict = {} + for clu in basedata.cluster.unique(): + cluvec = [] + basedf = basedata.query('cluster == @clu') + baseyears = basedf.year.unique() + for year in yearbase: + if year in baseyears: + cluvec.append(basedf.query('year == @year').counts.iloc[0]) + else: + cluvec.append(0) + cluDict[clu] = cluvec + + fig, ax = plt.subplots(figsize=(10, 7)) + ax.stackplot( + yearbase, + cluDict.values(), + labels=cluDict.keys(), + baseline='sym' + ) + ax.set_title('Cluster sizes') + ax.set_xlabel('Year') + ax.set_ylabel('Number of publications') + ax.axhline(0, color="black", ls="--") + + return fig From b481ef905e183c4f2e332b987458c07c92bb50f4 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 16 Dec 2021 14:46:28 +0100 Subject: [PATCH 14/53] finish streamgraph, add routine for reports, WIP --- setup.cfg | 2 + src/semanticlayertools/clustering/reports.py | 169 ++++++++++++++++++ .../{utils => pipelines}/__init__.py | 0 .../{utils => pipelines}/wordscorenet.py | 0 src/semanticlayertools/visual/utils.py | 82 ++++++--- tox.ini | 2 +- 6 files changed, 232 insertions(+), 23 deletions(-) create mode 100644 src/semanticlayertools/clustering/reports.py rename src/semanticlayertools/{utils => pipelines}/__init__.py (100%) rename src/semanticlayertools/{utils => pipelines}/wordscorenet.py (100%) diff --git a/setup.cfg b/setup.cfg index f990da3..9a67afc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,7 +25,9 @@ install_requires = matplotlib nltk numpy + scipy spacy + textacy pandas infomap igraph diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py new file mode 100644 index 0000000..5110510 --- /dev/null +++ b/src/semanticlayertools/clustering/reports.py @@ -0,0 +1,169 @@ +import re +import os +from tqdm import tqdm + +import spacy +import textacy +import textacy.tm +import pandas as pd +import multiprocessing + +num_processes = multiprocessing.cpu_count() + +mainLanguageCorp = 'en_core_web_lg' +nlp = spacy.load(mainLanguageCorp) + + +class ClusterReports(): + + def __init__( + self, infile:str, metadatapath:str, outpath:str, iteration: int, + numberProc: int=num_processes, minClusterSize: int=1000 + ): + self.iteration = iteration + self.numberProc = numberProc + self.minClusterSize = minClusterSize + self.metadatapath = metadatapath + self.outpath = outpath + + self.clusterdf = 
pd.read_csv(infile) + basedata = self.clusterdf.groupby(['year', 'cluster']).size().to_frame('counts').reset_index() + largeClusterList = list(basedata.groupby('cluster').sum().query(f'counts > {self.minClusterSize}').index) + + self.clusternodes = self.clusterdf.query( + 'cluster in @largeClusterList' + ) + + def create_corpus(self, dataframe): + """Create corpus out of dataframe.""" + docs = [] + titles = [x[0] for x in dataframe.title.values if type(x) == list] + for title in tqdm(titles): + try: + # text pre-processing + title = re.sub("\n", " ", title) + title = re.sub("[\r|\t|\x0c|\d+]", "", title) + title = re.sub("[.,]", "", title) + title = re.sub("\\\'s", "'s", title) + title = title.lower() + + doc = nlp(title) + + tokens_without_sw = ' '.join([t.lemma_ for t in doc if not t.is_stop]) + + docs.append(tokens_without_sw) + except: + print(title) + raise + + corpus_titles = textacy.Corpus(mainLanguageCorp, data=docs) + return corpus_titles + + + def find_topics(self, corpus_titles: list, n_topics: int, top_words: int, outpath:str='./', writeReport: bool=False): + """Calculate topics in corpus.""" + vectorizer = textacy.representations.vectorizers.Vectorizer( + tf_type="linear", + idf_type="smooth", + norm="l2", + min_df=2, + max_df=0.95 + ) + tokenized_docs = ( + ( + term.lemma_ for term in textacy.extract.terms(doc, ngs=1, ents=True) + ) for doc in corpus_titles + ) + doc_term_matrix = vectorizer.fit_transform(tokenized_docs) + + model = textacy.tm.TopicModel("nmf", n_topics) + model.fit(doc_term_matrix) + + doc_topic_matrix = model.transform(doc_term_matrix) + + topics = [] + for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_words): + topics.append("topic " + str(topic_idx) + ": " + " ".join(top_terms)) + if writeReport is True: + outfile.write(f'\n\n\tTopics in cluster for {n_topics} topics:\n') + for topic in topics: + outfile.write(f'\t\t{topic}\n') + else: + print("\nTopics in the cluster:\n") + return topics + + def fullReport(self, cluster): + """Generate full cluster report.""" + with open(f'{outFolderReports}Report_{cluster}.txt', 'a') as outfile: + selection = self.clusterdf.query('cluster == @cluster') + nodeList = list(set(selection.node.values)) + starttime = time.time() + result = {} + resultMeta = [] + for key, vals in groupby(sorted(nodeList), lambda x: x[:4]): + result[int(key)] = list(vals) + + for key in result.keys(): + if key > 1949 and key < 2005 and key != 1996: + yeardata = pd.read_json(f'{inFolderMetadata}{key}_meta.json', lines=True) + selectNodedata = yeardata[yeardata.nodeID.isin(result[key])] + resultMeta.append(selectNodedata) + + metadata = pd.concat(resultMeta) + metadata.to_json( + f'{outFolderReports}meta/cluster_{cluster}_meta.json', + orient='records', + lines=True + ) + foundNodes = [x[0] for x in metadata.bibcode.values] + notFound = [x for x in nodeList if x not in foundNodes] + + outfile.write( + f'\tGot {len(nodeList)} unique publications in time range:\ + {selection.year.min()} to {selection.year.max()}.\n' + ) + outfile.write( + f'\t\tFound metadata for {metadata.shape[0]} publications.\n' + ) + outfile.write( + f'\t\tThere are {len([x for x in foundNodes if x not in nodeList])}\ + found publications which where NOT in the query list.\n' + ) + outfile.write( + f'\t\tThere are {len(notFound)} publication(s) which where NOT found:\n' + ) + + topAuthors = Counter( + [x for y in [x for x in metadata.author.values if type(x) == list] for x in y] + ).most_common(20) + outfile.write('\n\tThe top authors of this 
cluster are:\n') + for elem in topAuthors: + outfile.write(f'\t\t{elem[0]}: {elem[1]} pubs\n.') + topAffils = Counter( + [x for y in [x for x in metadata.aff.values if type(x) == list] for x in y] + ).most_common(20) + outfile.write('\n\tThe top 20 affiliations of this cluster are:\n') + for elem in topAffils: + outfile.write(f'\t\t{elem[0]}: {elem[1]} authors.\n') + outfile.write( + f'\n\n\tFinished analysis of cluster {cluster} with {len(nodeList)}\ + unique publications in {time.time()- starttime} seconds.\n\n' + ) + corpus = create_corpus(metadata) + find_topics( + corpus, n_topics=15, top_words=10, writeReport=True, outfile=outfile + ) + find_topics( + corpus, n_topics=50, top_words=10, writeReport=True, outfile=outfile + ) + outfile.write( + f'\n\n\tFinished analysis of topics in {cluster} in {time.time()- starttime} seconds.\n\n' + ) + return cluster + + def processClusters(self, publicationIDcolumn: str='nodeID'): + for filename in tqdm(os.listdir(self.metadatapath)): + filepath = os.path.join(self.metadatapath, filename) + data = pd.read_json(filepath, lines=True) + selectMerge = data.merge(self.clusternodes, left_on=publicationIDcolumn, right_on='node', how='inner') + selectMerge.to_json(os.path.join(self.outpath, 'merge_' + filename) , orient='records', lines=True) diff --git a/src/semanticlayertools/utils/__init__.py b/src/semanticlayertools/pipelines/__init__.py similarity index 100% rename from src/semanticlayertools/utils/__init__.py rename to src/semanticlayertools/pipelines/__init__.py diff --git a/src/semanticlayertools/utils/wordscorenet.py b/src/semanticlayertools/pipelines/wordscorenet.py similarity index 100% rename from src/semanticlayertools/utils/wordscorenet.py rename to src/semanticlayertools/pipelines/wordscorenet.py diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index a2eff8d..0f79bdf 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -1,38 +1,76 @@ import matplotlib.pyplot as plt import pandas as pd +import numpy as np +from scipy import stats +from typing import TypeVar +smoothing = TypeVar('smoothing', bool, float) -def streamgraph(filepath): - """Plot streamgraph of cluster sizes vs years.""" + +def gaussian_smooth(x, y, grid, sd): + weights = np.transpose([stats.norm.pdf(grid, m, sd) for m in x]) + weights = weights / weights.sum(0) + return (weights * y).sum(1) + + +def streamgraph(filepath: str, smooth: smoothing=False, minClusterSize: int=1000, showNthGrid: int=5): + """Plot streamgraph of cluster sizes vs years. 
+ + Based on https://www.python-graph-gallery.com/streamchart-basic-matplotlib + """ basedf = pd.read_csv(filepath) basedata = basedf.groupby(['year', 'cluster']).size().to_frame('counts').reset_index() yearbase = [ - str(x) for x in range( + x for x in range( int(basedata.year.min()), int(basedata.year.max()) + 1 ) ] + largeclu = list(basedata.groupby('cluster').sum().query(f'counts > {minClusterSize}').index) cluDict = {} for clu in basedata.cluster.unique(): - cluvec = [] - basedf = basedata.query('cluster == @clu') - baseyears = basedf.year.unique() - for year in yearbase: - if year in baseyears: - cluvec.append(basedf.query('year == @year').counts.iloc[0]) - else: - cluvec.append(0) - cluDict[clu] = cluvec - - fig, ax = plt.subplots(figsize=(10, 7)) - ax.stackplot( - yearbase, - cluDict.values(), - labels=cluDict.keys(), - baseline='sym' - ) + if clu in largeclu: + cluvec = [] + basedf = basedata.query('cluster == @clu') + baseyears = list(basedf.year.unique()) + for year in yearbase: + if year in baseyears: + cluvec.append(basedf.query('year == @year').counts.iloc[0]) + else: + cluvec.append(0) + cluDict[clu] = cluvec + + fig, ax = plt.subplots(figsize=(16, 9)) + if type(smooth) is float: + grid = np.linspace(yearbase[0], yearbase[-1], num=100) + y = [np.array(x) for x in cluDict.values()] + y_smoothed = [gaussian_smooth(yearbase, y_, grid, smooth) for y_ in y] + ax.stackplot( + grid, + y_smoothed, + labels=cluDict.keys(), + baseline="sym" + ,colors=plt.get_cmap('tab20').colors + ) + + pass + else: + ax.stackplot( + yearbase, + cluDict.values(), + labels=cluDict.keys(), + baseline='sym', + colors=plt.get_cmap('tab20').colors + ) + ax.legend() ax.set_title('Cluster sizes') ax.set_xlabel('Year') ax.set_ylabel('Number of publications') - ax.axhline(0, color="black", ls="--") - + ax.yaxis.set_ticklabels([]) + ax.xaxis.grid(color='gray') + temp = ax.xaxis.get_ticklabels() + temp = list(set(temp) - set(temp[::showNthGrid])) + for label in temp: + label.set_visible(False) + ax.set_axisbelow(True) + #plt.show() return fig diff --git a/tox.ini b/tox.ini index ea55a51..389f70c 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py37,py38 +envlist = py39 isolated_build = True [pytest] From 8a1208f68f7181a29f6cb97366dfb8c65696750f Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Fri, 17 Dec 2021 15:08:44 +0100 Subject: [PATCH 15/53] upd origin, wip on reports multiprocessing --- src/semanticlayertools/clustering/leiden.py | 9 ++-- src/semanticlayertools/clustering/reports.py | 50 +++++++++++++++----- src/semanticlayertools/visual/utils.py | 1 - 3 files changed, 43 insertions(+), 17 deletions(-) diff --git a/src/semanticlayertools/clustering/leiden.py b/src/semanticlayertools/clustering/leiden.py index 31b43ea..f0045ad 100644 --- a/src/semanticlayertools/clustering/leiden.py +++ b/src/semanticlayertools/clustering/leiden.py @@ -18,6 +18,7 @@ def __init__( self, inpath: str, outpath: str, resolution: float = 0.003, intersliceCoupling: float = 0.4, timerange: tuple = (1945, 2005), + debug: debugVar = False ): starttime = time.time() @@ -33,7 +34,7 @@ def __init__( f'timeclusters_{timerange[0]}-{timerange[1]}_res_{resolution}_intersl_{intersliceCoupling}.csv' ) if os.path.isfile(self.outfile): - raise OSError('Output file exists. Please remove.') + raise OSError(f'Output file at {self.outfile} exists. Aborting.') edgefiles = [x for x in os.listdir(inpath) if x.endswith('_GC.net')] @@ -57,7 +58,7 @@ def __init__( f"loaded in {time.time() - starttime} seconds." 
) - def optimize(self): + def optimize(self, clusterSizeCompare: int=1000): """Optimize clusters accross time slices.""" starttime = time.time() @@ -109,10 +110,10 @@ def optimize(self): outfile.write( f"{elem[0]},{elem[1]},{elem[2]}\n" ) - + largeclu = [(x,len(x.vs)) for x in subgraphs if len(x.vs)>clusterSizeCompare] print( f'Finished in {time.time() - starttime} seconds.' - f"Found {len(subgraphs)} clusters." + f"Found {len(subgraphs)} clusters, with {len(largeclu)} larger then {clusterSizeCompare} nodes." ) return commun diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index 5110510..40b7b35 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -6,6 +6,7 @@ import textacy import textacy.tm import pandas as pd +import numpy as np import multiprocessing num_processes = multiprocessing.cpu_count() @@ -17,22 +18,28 @@ class ClusterReports(): def __init__( - self, infile:str, metadatapath:str, outpath:str, iteration: int, + self, infile:str, metadatapath:str, outpath:str, numberProc: int=num_processes, minClusterSize: int=1000 ): - self.iteration = iteration self.numberProc = numberProc self.minClusterSize = minClusterSize self.metadatapath = metadatapath - self.outpath = outpath - self.clusterdf = pd.read_csv(infile) basedata = self.clusterdf.groupby(['year', 'cluster']).size().to_frame('counts').reset_index() - largeClusterList = list(basedata.groupby('cluster').sum().query(f'counts > {self.minClusterSize}').index) - + largeClusterList = list( + basedata.groupby('cluster').sum().query(f'counts > {self.minClusterSize}').index + ) self.clusternodes = self.clusterdf.query( 'cluster in @largeClusterList' ) + outfolder = infile.split(os.path.sep)[-1].split('.')[0] + self.outpath = os.path.join(outpath, outfolder) + if os.path.isdir(self.outpath): + raise OSError(f'Output folder {self.outpath} exists. 
Aborting.') + else: + os.mkdir(self.outpath) + for clu in largeClusterList: + os.mkdir(os.path.join(self.outpath, f'Cluster_{clu}')) def create_corpus(self, dataframe): """Create corpus out of dataframe.""" @@ -161,9 +168,28 @@ def fullReport(self, cluster): ) return cluster - def processClusters(self, publicationIDcolumn: str='nodeID'): - for filename in tqdm(os.listdir(self.metadatapath)): - filepath = os.path.join(self.metadatapath, filename) - data = pd.read_json(filepath, lines=True) - selectMerge = data.merge(self.clusternodes, left_on=publicationIDcolumn, right_on='node', how='inner') - selectMerge.to_json(os.path.join(self.outpath, 'merge_' + filename) , orient='records', lines=True) + def _mergeData(self, filename, publicationIDcolumn: str='nodeID'): + filepath = os.path.join(self.metadatapath, filename) + data = pd.read_json(filepath, lines=True) + selectMerge = data.merge(self.clusternodes, left_on=publicationIDcolumn, right_on='node', how='inner') + if selectMerge.shape[0]>0: + for clu, g0 in selectMerge.groupby('cluster'): + g0.to_json(os.path.join(self.outpath, f'Cluster_{clu}', 'merged_' + filename) , orient='records', lines=True) + self.pbar.update(1) + return + + def gatherClusterMetadata(self): + filenames = os.listdir(self.metadatapath) + #chunk_size = int(len(filenames) / self.numberProc) + #chunks = np.array_split(filenames, chunk_size) + self.pbar = tqdm(len(filenames)) + pool = multiprocessing.Pool(self.numberProc) + result = pool.map(self._mergeData, filenames, chunksize=int(len(filenames) / self.numberProc)) + return + + # filepath = os.path.join(self.metadatapath, filename) + # data = pd.read_json(filepath, lines=True) + # selectMerge = data.merge(self.clusternodes, left_on=publicationIDcolumn, right_on='node', how='inner') + # if selectMerge.shape[0]>0: + # for clu, g0 in selectMerge.groupby('cluster'): + # g0.to_json(os.path.join(self.outpath, f'Cluster_{clu}', 'merged_' + filename) , orient='records', lines=True) diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index 0f79bdf..b7fc4aa 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -72,5 +72,4 @@ def streamgraph(filepath: str, smooth: smoothing=False, minClusterSize: int=1000 for label in temp: label.set_visible(False) ax.set_axisbelow(True) - #plt.show() return fig From 3f6769f01b8ea06283154a50ec02af9420ade5f7 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 20 Dec 2021 16:24:38 +0100 Subject: [PATCH 16/53] finish mp reporting, add pipeline --- src/semanticlayertools/clustering/leiden.py | 3 +- src/semanticlayertools/clustering/reports.py | 211 +++++++++--------- src/semanticlayertools/linkage/cocitation.py | 12 +- .../pipelines/cocitetimeclusters.py | 50 +++++ 4 files changed, 163 insertions(+), 113 deletions(-) create mode 100644 src/semanticlayertools/pipelines/cocitetimeclusters.py diff --git a/src/semanticlayertools/clustering/leiden.py b/src/semanticlayertools/clustering/leiden.py index f0045ad..4721783 100644 --- a/src/semanticlayertools/clustering/leiden.py +++ b/src/semanticlayertools/clustering/leiden.py @@ -18,7 +18,6 @@ def __init__( self, inpath: str, outpath: str, resolution: float = 0.003, intersliceCoupling: float = 0.4, timerange: tuple = (1945, 2005), - debug: debugVar = False ): starttime = time.time() @@ -116,4 +115,4 @@ def optimize(self, clusterSizeCompare: int=1000): f"Found {len(subgraphs)} clusters, with {len(largeclu)} larger then {clusterSizeCompare} nodes." 
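        # Illustrative sketch, not part of the committed diff: self.outfile written above is
        # a plain CSV with one `node,year,cluster` row per node occurrence and time slice, so
        # the size distribution reported here can be rechecked later with pandas, e.g.
        #     import pandas as pd
        #     clusters = pd.read_csv('timeclusters_1945-2005_res_0.003_intersl_0.4.csv')
        #     print(clusters.groupby('cluster').size().sort_values(ascending=False).head())
        # (the file name follows the pattern set in __init__ and is only an example here).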
) - return commun + return self.outfile, commun diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index 40b7b35..cb3cca3 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -1,5 +1,8 @@ import re import os +import time +import multiprocessing +from collections import Counter from tqdm import tqdm import spacy @@ -7,7 +10,7 @@ import textacy.tm import pandas as pd import numpy as np -import multiprocessing +import warnings num_processes = multiprocessing.cpu_count() @@ -18,34 +21,36 @@ class ClusterReports(): def __init__( - self, infile:str, metadatapath:str, outpath:str, - numberProc: int=num_processes, minClusterSize: int=1000 + self, infile: str, metadatapath: str, outpath: str, + numberProc: int = num_processes, minClusterSize: int = 1000, + timerange: tuple = (1945, 2005) ): self.numberProc = numberProc self.minClusterSize = minClusterSize self.metadatapath = metadatapath - self.clusterdf = pd.read_csv(infile) - basedata = self.clusterdf.groupby(['year', 'cluster']).size().to_frame('counts').reset_index() - largeClusterList = list( + clusterdf = pd.read_csv(infile) + basedata = clusterdf.groupby(['year', 'cluster']).size().to_frame('counts').reset_index() + self.largeClusterList = list( basedata.groupby('cluster').sum().query(f'counts > {self.minClusterSize}').index ) - self.clusternodes = self.clusterdf.query( - 'cluster in @largeClusterList' + self.clusternodes = clusterdf.query( + 'cluster in @self.largeClusterList' ) - outfolder = infile.split(os.path.sep)[-1].split('.')[0] + outfolder = infile.split(os.path.sep)[-1][:-4] + self.timerange = timerange self.outpath = os.path.join(outpath, outfolder) if os.path.isdir(self.outpath): raise OSError(f'Output folder {self.outpath} exists. 
Aborting.') else: os.mkdir(self.outpath) - for clu in largeClusterList: + for clu in self.largeClusterList: os.mkdir(os.path.join(self.outpath, f'Cluster_{clu}')) def create_corpus(self, dataframe): """Create corpus out of dataframe.""" docs = [] titles = [x[0] for x in dataframe.title.values if type(x) == list] - for title in tqdm(titles): + for title in tqdm(titles, leave=False): try: # text pre-processing title = re.sub("\n", " ", title) @@ -66,8 +71,9 @@ def create_corpus(self, dataframe): corpus_titles = textacy.Corpus(mainLanguageCorp, data=docs) return corpus_titles - - def find_topics(self, corpus_titles: list, n_topics: int, top_words: int, outpath:str='./', writeReport: bool=False): + def find_topics( + self, corpus_titles: list, n_topics: int, top_words: int, + ): """Calculate topics in corpus.""" vectorizer = textacy.representations.vectorizers.Vectorizer( tf_type="linear", @@ -86,110 +92,103 @@ def find_topics(self, corpus_titles: list, n_topics: int, top_words: int, outpat model = textacy.tm.TopicModel("nmf", n_topics) model.fit(doc_term_matrix) - doc_topic_matrix = model.transform(doc_term_matrix) - topics = [] for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_words): topics.append("topic " + str(topic_idx) + ": " + " ".join(top_terms)) - if writeReport is True: - outfile.write(f'\n\n\tTopics in cluster for {n_topics} topics:\n') - for topic in topics: - outfile.write(f'\t\t{topic}\n') - else: - print("\nTopics in the cluster:\n") - return topics + outtext = f'\n\n\tTopics in cluster for {n_topics} topics:\n' + for topic in topics: + outtext += f'\t\t{topic}\n' + return outtext def fullReport(self, cluster): """Generate full cluster report.""" - with open(f'{outFolderReports}Report_{cluster}.txt', 'a') as outfile: - selection = self.clusterdf.query('cluster == @cluster') - nodeList = list(set(selection.node.values)) - starttime = time.time() - result = {} - resultMeta = [] - for key, vals in groupby(sorted(nodeList), lambda x: x[:4]): - result[int(key)] = list(vals) - - for key in result.keys(): - if key > 1949 and key < 2005 and key != 1996: - yeardata = pd.read_json(f'{inFolderMetadata}{key}_meta.json', lines=True) - selectNodedata = yeardata[yeardata.nodeID.isin(result[key])] - resultMeta.append(selectNodedata) - - metadata = pd.concat(resultMeta) - metadata.to_json( - f'{outFolderReports}meta/cluster_{cluster}_meta.json', - orient='records', - lines=True - ) - foundNodes = [x[0] for x in metadata.bibcode.values] - notFound = [x for x in nodeList if x not in foundNodes] - - outfile.write( - f'\tGot {len(nodeList)} unique publications in time range:\ - {selection.year.min()} to {selection.year.max()}.\n' - ) - outfile.write( - f'\t\tFound metadata for {metadata.shape[0]} publications.\n' - ) - outfile.write( - f'\t\tThere are {len([x for x in foundNodes if x not in nodeList])}\ - found publications which where NOT in the query list.\n' - ) - outfile.write( - f'\t\tThere are {len(notFound)} publication(s) which where NOT found:\n' - ) - - topAuthors = Counter( - [x for y in [x for x in metadata.author.values if type(x) == list] for x in y] - ).most_common(20) - outfile.write('\n\tThe top authors of this cluster are:\n') - for elem in topAuthors: - outfile.write(f'\t\t{elem[0]}: {elem[1]} pubs\n.') - topAffils = Counter( - [x for y in [x for x in metadata.aff.values if type(x) == list] for x in y] - ).most_common(20) - outfile.write('\n\tThe top 20 affiliations of this cluster are:\n') - for elem in topAffils: - outfile.write(f'\t\t{elem[0]}: 
{elem[1]} authors.\n') - outfile.write( - f'\n\n\tFinished analysis of cluster {cluster} with {len(nodeList)}\ - unique publications in {time.time()- starttime} seconds.\n\n' - ) - corpus = create_corpus(metadata) - find_topics( - corpus, n_topics=15, top_words=10, writeReport=True, outfile=outfile - ) - find_topics( - corpus, n_topics=50, top_words=10, writeReport=True, outfile=outfile - ) - outfile.write( - f'\n\n\tFinished analysis of topics in {cluster} in {time.time()- starttime} seconds.\n\n' - ) - return cluster - - def _mergeData(self, filename, publicationIDcolumn: str='nodeID'): + starttime = time.time() + clusterpath = os.path.join(self.outpath, f'Cluster_{cluster}') + clusterfiles = os.listdir(clusterpath) + clusterdf = [] + for x in clusterfiles: + try: + clusterdf.append( + pd.read_json(os.path.join(clusterpath, x), lines=True) + ) + except ValueError: + raise + dfCluster = pd.concat(clusterdf, ignore_index=True) + basedf = self.clusternodes.query('cluster == @cluster') + inputnodes = basedf.node.values + foundNodes = [x[0] for x in dfCluster.bibcode.values] + notFound = [x for x in inputnodes if x not in foundNodes] + topAuthors = Counter( + [x for y in [x for x in dfCluster.author.values if type(x) == list] for x in y] + ).most_common(20) + authortext = '' + for x in topAuthors: + authortext += f'\t{x[0]}: {x[1]}\n' + topAffils = Counter( + [x for y in [x for x in dfCluster.aff.values if type(x) == list] for x in y] + ).most_common(21) + affiltext = '' + for x in topAffils[1:]: + affiltext += f'\t{x[0]}: {x[1]}\n' + corpus = self.create_corpus(dfCluster) + warnings.simplefilter(action='ignore', category=FutureWarning) + topics_15 = self.find_topics(corpus, n_topics=15, top_words=20) + topics_50 = self.find_topics(corpus, n_topics=50, top_words=20) + outtext = f"""Report for Cluster {cluster} + +Got {len(inputnodes)} unique publications in time range: {basedf.year.min()} to {basedf.year.max()}. + Found metadata for {dfCluster.shape[0]} publications. + There are {len(notFound)} publications without metadata. 
+ + The top 20 authors of this cluster are: + {authortext} + + The top 20 affiliations of this cluster are: + {affiltext} + + {topics_15} + + {topics_50} + +Finished analysis of cluster {cluster} in {time.time()- starttime} seconds.""" + return outtext + + def _mergeData(self, filename, publicationIDcolumn: str = 'nodeID'): filepath = os.path.join(self.metadatapath, filename) data = pd.read_json(filepath, lines=True) - selectMerge = data.merge(self.clusternodes, left_on=publicationIDcolumn, right_on='node', how='inner') - if selectMerge.shape[0]>0: + selectMerge = data.merge( + self.clusternodes, + left_on=publicationIDcolumn, + right_on='node', + how='inner' + ) + if selectMerge.shape[0] > 0: for clu, g0 in selectMerge.groupby('cluster'): - g0.to_json(os.path.join(self.outpath, f'Cluster_{clu}', 'merged_' + filename) , orient='records', lines=True) - self.pbar.update(1) - return + g0.to_json( + os.path.join( + self.outpath, + f'Cluster_{clu}', + 'merged_' + filename + ), orient='records', lines=True + ) + return '' def gatherClusterMetadata(self): filenames = os.listdir(self.metadatapath) - #chunk_size = int(len(filenames) / self.numberProc) - #chunks = np.array_split(filenames, chunk_size) - self.pbar = tqdm(len(filenames)) - pool = multiprocessing.Pool(self.numberProc) - result = pool.map(self._mergeData, filenames, chunksize=int(len(filenames) / self.numberProc)) + yearFiles = [] + for x in filenames: + try: + year = int(re.findall(r'\d{4}', x)[0]) + except: + raise + if self.timerange[0] <= year <= self.timerange[1]: + yearFiles.append(x) + with multiprocessing.Pool(self.numberProc) as pool: + _ = pool.map(self._mergeData, tqdm(yearFiles, leave=False)) return - # filepath = os.path.join(self.metadatapath, filename) - # data = pd.read_json(filepath, lines=True) - # selectMerge = data.merge(self.clusternodes, left_on=publicationIDcolumn, right_on='node', how='inner') - # if selectMerge.shape[0]>0: - # for clu, g0 in selectMerge.groupby('cluster'): - # g0.to_json(os.path.join(self.outpath, f'Cluster_{clu}', 'merged_' + filename) , orient='records', lines=True) + def writeReports(self): + for cluster in tqdm(self.largeClusterList): + outtext = self.fullReport(cluster) + with open(f'{self.outpath}Cluster_{cluster}.txt', 'w') as file: + file.write(outtext) diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index a78a745..143b0df 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -16,12 +16,14 @@ limitRefLength = TypeVar('limitRefLength', bool, int) debugVar = TypeVar('debugVar', bool, str) + class Cocitations(): """Create cocitation networks.""" def __init__( self, inpath, outpath, columnName, - numberProc: int=num_processes, limitRefLength: limitRefLength=False, debug: debugVar=False, + numberProc: int = num_processes, limitRefLength: limitRefLength = False, + debug: debugVar = False, ): self.inpath = inpath self.outpath = outpath @@ -35,7 +37,7 @@ def getCombinations(self, chunk): res = [] if type(self.limitRefLength) == int: reflen = chunk[self.columnName].apply( - lambda x: True if type(x)==list and len(x)<=self.limitRefLength else False + lambda x: True if type(x) == list and len(x) <= self.limitRefLength else False ) data = chunk[reflen].copy() else: @@ -65,7 +67,7 @@ def calculateCoCitation(self, filepath): sortedComponents = sorted( [(x, len(x), len(x)*100/len(tempG.vs)) for x in components], key=lambda x: x[1], reverse=True ) - with 
open(os.path.join(self.outpath,infilename + '_graphMetadata.txt'), 'w') as outfile: + with open(os.path.join(self.outpath, infilename + '_graphMetadata.txt'), 'w') as outfile: outfile.write(f'Graph derived from {filepath}\nSummary:\n') outfile.write(tempG.summary() + '\n\nComponents (ordered by size):\n\n') for idx, elem in enumerate(sortedComponents): @@ -76,9 +78,9 @@ def calculateCoCitation(self, filepath): giantComponent = sortedComponents[0] giantComponentGraph = tempG.vs.select(giantComponent[0]).subgraph() giantComponentGraph.write_pajek( - os.path.join(self.outpath,infilename + '_GC.net') + os.path.join(self.outpath, infilename + '_GC.net') ) - with open(os.path.join(self.outpath,infilename + '.ncol'), 'w') as outfile: + with open(os.path.join(self.outpath, infilename + '.ncol'), 'w') as outfile: for edge in sortCoCitCounts: outfile.write(f"{edge[0]} {edge[1]} {edge[2]}\n") except: diff --git a/src/semanticlayertools/pipelines/cocitetimeclusters.py b/src/semanticlayertools/pipelines/cocitetimeclusters.py new file mode 100644 index 0000000..8ce6027 --- /dev/null +++ b/src/semanticlayertools/pipelines/cocitetimeclusters.py @@ -0,0 +1,50 @@ +"""Runs all steps to create reports for cocite temporal network clustering.""" +import tempfile +from datetime import datetime +import os +import multiprocessing + +from ..linkage.cocitation import Cocitations +from ..clustering.leiden import TimeCluster +from ..clustering.reports import ClusterReports + +num_processes = multiprocessing.cpu_count() + + +def run( + basepath, + cociteOutpath, + timeclusterOutpath, + reportsOutpath, + resolution, + intersliceCoupling, + minClusterSize: int = 1000, + timerange=(1945, 2005), + referenceColumnName: str = 'reference', + numberproc: int = num_processes, + limitRefLength=False, debug=False +): + cocites = Cocitations( + basepath, cociteOutpath, referenceColumnName, limitRefLength, debug + ) + cocites.processFolder() + timeclusters = TimeCluster( + inpath=cociteOutpath, + outpath=timeclusterOutpath, + resolution=resolution, + intersliceCoupling=intersliceCoupling, + timerange=timerange, + debug=debug + ) + timeclfile, _ = timeclusters.optimize() + clusterreports = ClusterReports( + infile=timeclfile, + metadatapath=basepath, + outpath=reportsOutpath, + numberProc=numberproc, + minClusterSize=minClusterSize, + timerange=(timerange[0], timerange[1] + 3) + ) + clusterreports.gatherClusterMetadata() + clusterreports.writeReports() + print('Done') From c4b60dfd2406871d76ea5a9ee7a51679c635309d Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 21 Dec 2021 15:00:32 +0100 Subject: [PATCH 17/53] finish pipeline, minor improv of reportings --- src/semanticlayertools/clustering/leiden.py | 8 ++-- src/semanticlayertools/clustering/reports.py | 17 +++++--- src/semanticlayertools/linkage/cocitation.py | 43 +++++++++++++++---- .../pipelines/cocitetimeclusters.py | 32 ++++++++------ 4 files changed, 69 insertions(+), 31 deletions(-) diff --git a/src/semanticlayertools/clustering/leiden.py b/src/semanticlayertools/clustering/leiden.py index 4721783..c695d21 100644 --- a/src/semanticlayertools/clustering/leiden.py +++ b/src/semanticlayertools/clustering/leiden.py @@ -39,7 +39,7 @@ def __init__( self.graphDict = {} - for idx in tqdm(range(len(edgefiles))): + for idx in tqdm(range(len(edgefiles)), leave=False): try: year = re.findall(r'\d{4}', edgefiles[idx])[0] except: @@ -66,7 +66,7 @@ def optimize(self, clusterSizeCompare: int=1000): interslice_weight=self.interslice_param, vertex_id_attr='name' ) - print('\tHave set 
layers.') + print('\tSet layers.') partitions = [ la.CPMVertexPartition( @@ -76,7 +76,7 @@ def optimize(self, clusterSizeCompare: int=1000): resolution_parameter=self.res_param ) for H in layers ] - print('\tHave set partitions.') + print('\tSet partitions.') interslice_partition = la.CPMVertexPartition( interslice_layer, @@ -84,7 +84,7 @@ def optimize(self, clusterSizeCompare: int=1000): node_sizes='node_size', weights='weight' ) - print('\tHave set interslice partions.') + print('\tSet interslice partions.') self.optimiser.optimise_partition_multiplex( partitions + [interslice_partition] diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index cb3cca3..a4b27c6 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -100,7 +100,9 @@ def find_topics( outtext += f'\t\t{topic}\n' return outtext - def fullReport(self, cluster): + def fullReport(self, cluster, authorColumnName: str = 'author', + affiliationColumnName: str = 'aff' + ): """Generate full cluster report.""" starttime = time.time() clusterpath = os.path.join(self.outpath, f'Cluster_{cluster}') @@ -115,21 +117,21 @@ def fullReport(self, cluster): raise dfCluster = pd.concat(clusterdf, ignore_index=True) basedf = self.clusternodes.query('cluster == @cluster') - inputnodes = basedf.node.values - foundNodes = [x[0] for x in dfCluster.bibcode.values] - notFound = [x for x in inputnodes if x not in foundNodes] + inputnodes = set(basedf.node.values) + notFound = inputnodes.difference(set(dfCluster.nodeID.values)) topAuthors = Counter( - [x for y in [x for x in dfCluster.author.values if type(x) == list] for x in y] + [x for y in dfCluster[authorColumnName].fillna('').values for x in y] ).most_common(20) authortext = '' for x in topAuthors: authortext += f'\t{x[0]}: {x[1]}\n' topAffils = Counter( - [x for y in [x for x in dfCluster.aff.values if type(x) == list] for x in y] + [x for y in dfCluster[affiliationColumnName].fillna('').values for x in y] ).most_common(21) affiltext = '' for x in topAffils[1:]: affiltext += f'\t{x[0]}: {x[1]}\n' + print(f'\tFinished base report for cluster {cluster}.') corpus = self.create_corpus(dfCluster) warnings.simplefilter(action='ignore', category=FutureWarning) topics_15 = self.find_topics(corpus, n_topics=15, top_words=20) @@ -151,6 +153,7 @@ def fullReport(self, cluster): {topics_50} Finished analysis of cluster {cluster} in {time.time()- starttime} seconds.""" + print('\t\tFinished topics.') return outtext def _mergeData(self, filename, publicationIDcolumn: str = 'nodeID'): @@ -188,7 +191,7 @@ def gatherClusterMetadata(self): return def writeReports(self): - for cluster in tqdm(self.largeClusterList): + for cluster in tqdm(self.largeClusterList, leave=False): outtext = self.fullReport(cluster) with open(f'{self.outpath}Cluster_{cluster}.txt', 'w') as file: file.write(outtext) diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index 143b0df..98840fc 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -1,6 +1,7 @@ """Link documents by cocitation.""" import os import time +import re import multiprocessing from itertools import combinations from collections import Counter @@ -22,14 +23,17 @@ class Cocitations(): def __init__( self, inpath, outpath, columnName, - numberProc: int = num_processes, limitRefLength: limitRefLength = False, - debug: debugVar = False, + numberProc: int = 
num_processes, + limitRefLength: limitRefLength = False, + timerange: tuple = (1945, 2005), + debug: debugVar = False ): self.inpath = inpath self.outpath = outpath self.columnName = columnName self.numberProc = numberProc self.limitRefLength = limitRefLength + self.timerange = timerange self.debug = debug def getCombinations(self, chunk): @@ -75,6 +79,13 @@ def calculateCoCitation(self, filepath): outfile.write( f"{idx}:\n\t{elem[1]} nodes ({elem[2]:.3f}% of full graph)\n\t{len(gcompTemp.es)} edges ({len(gcompTemp.es)*100/len(tempG.es):.3f}% of full graph)\n\n" ) + if idx == 0: + gcouttuple = ( + elem[1], + elem[2], + len(gcompTemp.es), + len(gcompTemp.es)*100/len(tempG.es) + ) giantComponent = sortedComponents[0] giantComponentGraph = tempG.vs.select(giantComponent[0]).subgraph() giantComponentGraph.write_pajek( @@ -87,15 +98,31 @@ def calculateCoCitation(self, filepath): raise if self.debug == "l2": print(f'\tDone in {time.time() - starttime} seconds.') - return + return gcouttuple def processFolder(self): """Calculate cocitation for all files in folder.""" starttime = time.time() - for file in tqdm(os.listdir(self.inpath)): - try: - self.calculateCoCitation(os.path.join(self.inpath, file)) - except: - raise + with open( + os.path.join( + self.outpath, 'Giant_Component_properties.csv' + ), 'w' + ) as gcmetafile: + gcmetafile.write('year,nodes,nodespercent,edges,edgepercent\n') + for file in tqdm(os.listdir(self.inpath), leave=False): + try: + year = re.findall(r'\d{4}', file)[0] + except: + raise + if self.timerange[0] <= int(year) <= self.timerange[1]: + try: + outtuple = self.calculateCoCitation( + os.path.join(self.inpath, file) + ) + gcmetafile.write( + f'{year},{outtuple[0]},{outtuple[1]},{outtuple[2]},{outtuple[3]}\n' + ) + except: + raise if self.debug is True: print(f'\tDone in {time.time() - starttime} seconds.') diff --git a/src/semanticlayertools/pipelines/cocitetimeclusters.py b/src/semanticlayertools/pipelines/cocitetimeclusters.py index 8ce6027..1762507 100644 --- a/src/semanticlayertools/pipelines/cocitetimeclusters.py +++ b/src/semanticlayertools/pipelines/cocitetimeclusters.py @@ -1,6 +1,5 @@ """Runs all steps to create reports for cocite temporal network clustering.""" -import tempfile -from datetime import datetime +import time import os import multiprocessing @@ -12,20 +11,29 @@ def run( - basepath, - cociteOutpath, - timeclusterOutpath, - reportsOutpath, - resolution, - intersliceCoupling, + inputFilepath: str, + cociteOutpath: str, + timeclusterOutpath: str, + reportsOutpath: str, + resolution: float, + intersliceCoupling: float, minClusterSize: int = 1000, - timerange=(1945, 2005), + timerange: tuple = (1945, 2005), referenceColumnName: str = 'reference', numberproc: int = num_processes, limitRefLength=False, debug=False ): + for path in [cociteOutpath, timeclusterOutpath, reportsOutpath]: + os.makedirs(path) + starttime = time.time() cocites = Cocitations( - basepath, cociteOutpath, referenceColumnName, limitRefLength, debug + inpath=inputFilepath, + outpath=cociteOutpath, + columnName=referenceColumnName, + numberProc=numberproc, + limitRefLength=limitRefLength, + timerange=timerange, + debug=debug ) cocites.processFolder() timeclusters = TimeCluster( @@ -39,7 +47,7 @@ def run( timeclfile, _ = timeclusters.optimize() clusterreports = ClusterReports( infile=timeclfile, - metadatapath=basepath, + metadatapath=inputFilepath, outpath=reportsOutpath, numberProc=numberproc, minClusterSize=minClusterSize, @@ -47,4 +55,4 @@ def run( ) 
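    # Illustrative sketch, not part of the committed diff: with the three stages wired up
    # above (cocitation networks, temporal Leiden clustering, cluster reports), a typical
    # invocation of this pipeline could look as follows. All paths and parameter values are
    # assumptions for illustration; only the keyword names come from the signature of run():
    #     from semanticlayertools.pipelines import cocitetimeclusters
    #     cocitetimeclusters.run(
    #         inputFilepath='./data/',                 # one JSON-lines metadata file per year
    #         cociteOutpath='./output/cocite/',
    #         timeclusterOutpath='./output/timeclusters/',
    #         reportsOutpath='./output/reports/',
    #         resolution=0.003,
    #         intersliceCoupling=0.4,
    #         timerange=(1945, 2005),
    #         referenceColumnName='reference',
    #     )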
clusterreports.gatherClusterMetadata() clusterreports.writeReports() - print('Done') + print(f'Done after {time.time() - starttime} seconds.') From e4e81d9a75fc7469f6ea616cc23a9732554451b4 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 21 Dec 2021 16:19:05 +0100 Subject: [PATCH 18/53] add embedding utility fct --- setup.cfg | 21 ++++++++++- src/semanticlayertools/visual/utils.py | 52 +++++++++++++++++++++++++- 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 9a67afc..6621ead 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,7 +19,7 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.7 +python_requires = >=3.8 install_requires = tqdm matplotlib @@ -33,5 +33,24 @@ install_requires = igraph leidenalg +[options.extras_require] +all = + %(embeddml)s + %(doc)s + %(dev)s + %(test)s +doc = + sphinx +dev = + twine + %(test)s +test = + tox +embeddml = + torch + umap-learn + sentence-transformers + plotly + [options.packages.find] where = src diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index b7fc4aa..4558e68 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -1,8 +1,18 @@ +import os +from typing import TypeVar + import matplotlib.pyplot as plt import pandas as pd import numpy as np from scipy import stats -from typing import TypeVar + +from collections import Counter +import plotly.express as px +import plotly.graph_objects as go + +from sentence_transformers import SentenceTransformer, util +import umap +import torch smoothing = TypeVar('smoothing', bool, float) @@ -73,3 +83,43 @@ def streamgraph(filepath: str, smooth: smoothing=False, minClusterSize: int=1000 label.set_visible(False) ax.set_axisbelow(True) return fig + + +def embeddedText(infolderpath: str, columnName: str, outpath: str): + """Create embedding for corpus text.""" + print('Initializing embedder model.') + model = SentenceTransformer('all-MiniLM-L6-v2') + clusterfiles = os.listdir(infolderpath) + clusterdf = [] + for x in clusterfiles: + try: + clusterdf.append( + pd.read_json(os.path.join(infolderpath, x), lines=True) + ) + except ValueError: + raise + dataframe = pd.concat(clusterdf, ignore_index=True) + corpus = [x[0] for x in dataframe[columnName].fillna('').values if x] + print('Start embedding.') + corpus_embeddings = model.encode( + corpus, + convert_to_tensor=True + ) + torch.save( + corpus_embeddings, + f'{os.path.join(outpath, "embeddedCorpus.pt")}' + ) + print('\tDone\nStarting mapping to 2D.') + corpus_embeddings_2D = umap.UMAP( + n_neighbors=15, + n_components=2, + metric='cosine' + ).fit_transform(corpus_embeddings) + corpus_embeddings_2D.tofile( + f'{os.path.join(outpath, "embeddedCorpus_2d.csv")}', + sep=',' + ) + print('\tDone.') + dataframe.insert(0, 'x', corpus_embeddings_2D[:, 0]) + dataframe.insert(0, 'y', corpus_embeddings_2D[:, 1]) + return dataframe From e1459bf3790384feca8876694190b92a3f17360e Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 22 Dec 2021 12:55:43 +0100 Subject: [PATCH 19/53] rm not necess imports --- src/semanticlayertools/visual/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index 4558e68..f2c8732 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -10,7 +10,7 @@ import plotly.express as px import plotly.graph_objects as go -from sentence_transformers import 
SentenceTransformer, util +from sentence_transformers import SentenceTransformer import umap import torch From 23434304af90578985b9093a9a9b064f57620ec6 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 3 Jan 2022 14:09:36 +0100 Subject: [PATCH 20/53] fix csv export of embeddings --- src/semanticlayertools/visual/utils.py | 40 ++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index f2c8732..63defe3 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -85,7 +85,7 @@ def streamgraph(filepath: str, smooth: smoothing=False, minClusterSize: int=1000 return fig -def embeddedText(infolderpath: str, columnName: str, outpath: str): +def embeddedTextPlotting(infolderpath: str, columnName: str, outpath: str): """Create embedding for corpus text.""" print('Initializing embedder model.') model = SentenceTransformer('all-MiniLM-L6-v2') @@ -99,7 +99,8 @@ def embeddedText(infolderpath: str, columnName: str, outpath: str): except ValueError: raise dataframe = pd.concat(clusterdf, ignore_index=True) - corpus = [x[0] for x in dataframe[columnName].fillna('').values if x] + dataframe = dataframe.dropna(subset=[columnName], axis=0) + corpus = [x[0] for x in dataframe[columnName].values if x] print('Start embedding.') corpus_embeddings = model.encode( corpus, @@ -115,10 +116,37 @@ def embeddedText(infolderpath: str, columnName: str, outpath: str): n_components=2, metric='cosine' ).fit_transform(corpus_embeddings) - corpus_embeddings_2D.tofile( - f'{os.path.join(outpath, "embeddedCorpus_2d.csv")}', - sep=',' - ) + np.savetxt(os.path.join(outpath, "embeddedCorpus_2d.csv"), corpus_embeddings_2D, delimiter=',', newline='\n') + print('\tDone.') + dataframe.insert(0, 'x', corpus_embeddings_2D[:, 0]) + dataframe.insert(0, 'y', corpus_embeddings_2D[:, 1]) + return dataframe + + +def embeddedTextClustering(infolderpath: str, columnName: str, emdeddingspath: str, outpath: str): + """Create clustering based on embedding for corpus texts.""" + print('Initializing embedder model.') + model = SentenceTransformer('all-MiniLM-L6-v2') + clusterfiles = os.listdir(infolderpath) + clusterdf = [] + for x in clusterfiles: + try: + clusterdf.append( + pd.read_json(os.path.join(infolderpath, x), lines=True) + ) + except ValueError: + raise + dataframe = pd.concat(clusterdf, ignore_index=True) + corpus = [x[0] for x in dataframe[columnName].fillna('').values if x] + print('Loading embedding.') + corpus_embeddings = torch.load(embeddingspath) + print('\tDone\nStarting mapping to lower dimensions.') + corpus_embeddings = umap.UMAP( + n_neighbors=15, + n_components=50, + metric='cosine' + ).fit_transform(corpus_embeddings) + np.savetxt(os.path.join(outpath, "embeddedCorpus_50d.csv"), corpus_embeddings_2D, delimiter=',', newline='\n') print('\tDone.') dataframe.insert(0, 'x', corpus_embeddings_2D[:, 0]) dataframe.insert(0, 'y', corpus_embeddings_2D[:, 1]) From 9ed19a89cdadd99fe5d3a24452ca32c09cae8655 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 3 Jan 2022 19:17:19 +0100 Subject: [PATCH 21/53] add util for clustering --- setup.cfg | 1 + src/semanticlayertools/visual/utils.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/setup.cfg b/setup.cfg index 6621ead..a328a5e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,6 +49,7 @@ test = embeddml = torch umap-learn + hdbscan sentence-transformers plotly diff --git 
a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index 63defe3..725fc30 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -12,6 +12,7 @@ from sentence_transformers import SentenceTransformer import umap +import hdbscan import torch smoothing = TypeVar('smoothing', bool, float) @@ -126,7 +127,6 @@ def embeddedTextPlotting(infolderpath: str, columnName: str, outpath: str): def embeddedTextClustering(infolderpath: str, columnName: str, emdeddingspath: str, outpath: str): """Create clustering based on embedding for corpus texts.""" print('Initializing embedder model.') - model = SentenceTransformer('all-MiniLM-L6-v2') clusterfiles = os.listdir(infolderpath) clusterdf = [] for x in clusterfiles: @@ -137,17 +137,22 @@ def embeddedTextClustering(infolderpath: str, columnName: str, emdeddingspath: s except ValueError: raise dataframe = pd.concat(clusterdf, ignore_index=True) - corpus = [x[0] for x in dataframe[columnName].fillna('').values if x] + dataframe = dataframe.dropna(subset=[columnName], axis=0) + corpus = [x[0] for x in dataframe[columnName].values if x] print('Loading embedding.') corpus_embeddings = torch.load(embeddingspath) print('\tDone\nStarting mapping to lower dimensions.') - corpus_embeddings = umap.UMAP( + corpus_embeddings_50D = umap.UMAP( n_neighbors=15, n_components=50, metric='cosine' ).fit_transform(corpus_embeddings) - np.savetxt(os.path.join(outpath, "embeddedCorpus_50d.csv"), corpus_embeddings_2D, delimiter=',', newline='\n') - print('\tDone.') - dataframe.insert(0, 'x', corpus_embeddings_2D[:, 0]) - dataframe.insert(0, 'y', corpus_embeddings_2D[:, 1]) + np.savetxt(os.path.join(outpath, "embeddedCorpus_50d.csv"), corpus_embeddings_50D, delimiter=',', newline='\n') + print('\tDone.\nStarting clustering.') + cluster = hdbscan.HDBSCAN( + min_cluster_size=20, + metric='euclidean', + cluster_selection_method='eom' + ).fit(corpus_embeddings_50D) + dataframe.insert(0, 'label', cluster.labels_) return dataframe From f8b297a4356a976d89c8fee04557d556cb9913a7 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 4 Jan 2022 13:55:46 +0100 Subject: [PATCH 22/53] improve docs --- docs/clustering.rst | 15 ++++++ docs/index.rst | 6 ++- docs/linkage.rst | 5 ++ docs/pipelines.rst | 11 +++++ docs/visual.rst | 6 +++ src/semanticlayertools/linkage/cocitation.py | 48 ++++++++++++++++++-- src/semanticlayertools/linkage/wordscore.py | 8 ++-- src/semanticlayertools/visual/utils.py | 21 +++++++-- tox.ini | 10 +++- 9 files changed, 114 insertions(+), 16 deletions(-) create mode 100644 docs/clustering.rst create mode 100644 docs/pipelines.rst create mode 100644 docs/visual.rst diff --git a/docs/clustering.rst b/docs/clustering.rst new file mode 100644 index 0000000..62d1dc8 --- /dev/null +++ b/docs/clustering.rst @@ -0,0 +1,15 @@ +Clustering network data +======================= + +.. automodule:: semanticlayertools.clustering.infomap + :members: + :undoc-members: + + +.. automodule:: semanticlayertools.clustering.leiden + :members: + :undoc-members: + +.. automodule:: semanticlayertools.clustering.reports + :members: + :undoc-members: diff --git a/docs/index.rst b/docs/index.rst index 747681b..a445cda 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,8 +12,12 @@ This project collects tools to build semantic layers from text corpora. 
:maxdepth: 2 :caption: Contents: - linkage cleaning + pipelines + linkage + clustering + visual + diff --git a/docs/linkage.rst b/docs/linkage.rst index 441a5d1..3e65b98 100644 --- a/docs/linkage.rst +++ b/docs/linkage.rst @@ -4,3 +4,8 @@ Word scoring and linkage .. automodule:: semanticlayertools.linkage.wordscore :members: :undoc-members: + + +.. automodule:: semanticlayertools.linkage.cocitation + :members: + :undoc-members: diff --git a/docs/pipelines.rst b/docs/pipelines.rst new file mode 100644 index 0000000..0c2ff90 --- /dev/null +++ b/docs/pipelines.rst @@ -0,0 +1,11 @@ +Pipelines for workflows +======================= + +.. automodule:: semanticlayertools.pipelines.cocitetimeclusters + :members: + :undoc-members: + + +.. automodule:: semanticlayertools.pipelines.wordscorenet + :members: + :undoc-members: diff --git a/docs/visual.rst b/docs/visual.rst new file mode 100644 index 0000000..29e6a31 --- /dev/null +++ b/docs/visual.rst @@ -0,0 +1,6 @@ +Utility functions for visualizations +==================================== + +.. automodule:: semanticlayertools.visual.utils + :members: + :undoc-members: diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index 98840fc..2872b33 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -19,7 +19,33 @@ class Cocitations(): - """Create cocitation networks.""" + """Create cocitation networks. + + Calculates all combinations of all references of publications in given + corpus file(s). Can be limited for maximal number of references to consider + (e.g. papers with less then 200 references), to speed up creation of + networks. + + For each corpus file, graphs are generated by the weighted cocitation tuples, + using the Igraph package. Information on obtained clusters are written to + '_graphMetadata.txt' files. The subgraph of the Giant component is saved in + Pajek format with the ending '_GC.net'. The full edge data is written in + edge-Format to a '.ncol' file. + + :param inpath: Path for input data + :type inpath: str + :param outpath: Path for writing output data + :type outpath: str + :param columnName: Column name containing the references of a publication + :type columnName: str + :param numberProc: Number of CPUs the package is allowed to use (default=all) + :type numberProc: int + :param limitRefLength: Either False or integer giving the maximum number of references a considered publication is allowed to contain + :type limitRefLength: bool or int + :param timerange: Time range to consider (default=(1945,2005)) + :type timerange: tuple + :param debug: False/True or l2 to show level 2 debugging messages + """ def __init__( self, inpath, outpath, columnName, @@ -37,7 +63,13 @@ def __init__( self.debug = debug def getCombinations(self, chunk): - """Calculate combinations.""" + """Calculate combinations of references in publications chunk. + + :param chunk: A chunk of the corpus dataframe + :type chunk: `pd.Dataframe` + :returns: A list of all reference combinations for each corpus entry + :rtype: list + """ res = [] if type(self.limitRefLength) == int: reflen = chunk[self.columnName].apply( @@ -53,7 +85,17 @@ def getCombinations(self, chunk): return res def calculateCoCitation(self, filepath): - """Do calculation for input file.""" + """Run calculation for single input file. 
+ + Creates three files: Metadata-File with all components information, + Giant component network data in pajek format and full graph data in + edgelist format. + + :param filepath: Path for input corous + :type filepath: str + :returns: A tuple of GC information: Number of nodes and percentage of total, Number of edges and percentage of total + :rtype: tuple + """ infilename = filepath.split(os.path.sep)[-1].split('.')[0] starttime = time.time() try: diff --git a/src/semanticlayertools/linkage/wordscore.py b/src/semanticlayertools/linkage/wordscore.py index 84bdaa7..e36721b 100644 --- a/src/semanticlayertools/linkage/wordscore.py +++ b/src/semanticlayertools/linkage/wordscore.py @@ -1,6 +1,5 @@ import os -import re -from collections import Counter, defaultdict +from collections import Counter from itertools import islice, combinations from multiprocessing import Pool, cpu_count from tqdm import tqdm @@ -61,7 +60,7 @@ def __init__( self.counts = {} self.corpussize = 1 self.uniqueNGrams = () - self.debug=debug + self.debug = debug def getTermPatterns(self): """Create dictionaries of occuring ngrams.""" @@ -151,8 +150,7 @@ class LinksOverTime(): This class takes care of this, by adding new keys of authors, papers or ngrams to the register. - :param dataframe: Source dataframe containing metadata of texts - (authors, publicationID and year) + :param dataframe: Source dataframe containing metadata of texts (authors, publicationID and year) :type dataframe: class:`pandas.DataFrame` :param authorColumn: Column name for author information :param pubIDColumn: Column name to identify publications diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index 725fc30..c49aef8 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -117,14 +117,21 @@ def embeddedTextPlotting(infolderpath: str, columnName: str, outpath: str): n_components=2, metric='cosine' ).fit_transform(corpus_embeddings) - np.savetxt(os.path.join(outpath, "embeddedCorpus_2d.csv"), corpus_embeddings_2D, delimiter=',', newline='\n') + np.savetxt( + os.path.join(outpath, "embeddedCorpus_2d.csv"), + corpus_embeddings_2D, + delimiter=',', + newline='\n' + ) print('\tDone.') dataframe.insert(0, 'x', corpus_embeddings_2D[:, 0]) dataframe.insert(0, 'y', corpus_embeddings_2D[:, 1]) return dataframe -def embeddedTextClustering(infolderpath: str, columnName: str, emdeddingspath: str, outpath: str): +def embeddedTextClustering( + infolderpath: str, columnName: str, emdeddingspath: str, outpath: str +): """Create clustering based on embedding for corpus texts.""" print('Initializing embedder model.') clusterfiles = os.listdir(infolderpath) @@ -138,16 +145,20 @@ def embeddedTextClustering(infolderpath: str, columnName: str, emdeddingspath: s raise dataframe = pd.concat(clusterdf, ignore_index=True) dataframe = dataframe.dropna(subset=[columnName], axis=0) - corpus = [x[0] for x in dataframe[columnName].values if x] print('Loading embedding.') - corpus_embeddings = torch.load(embeddingspath) + corpus_embeddings = torch.load(emdeddingspath) print('\tDone\nStarting mapping to lower dimensions.') corpus_embeddings_50D = umap.UMAP( n_neighbors=15, n_components=50, metric='cosine' ).fit_transform(corpus_embeddings) - np.savetxt(os.path.join(outpath, "embeddedCorpus_50d.csv"), corpus_embeddings_50D, delimiter=',', newline='\n') + np.savetxt( + os.path.join(outpath, "embeddedCorpus_50d.csv"), + corpus_embeddings_50D, + delimiter=',', + newline='\n' + ) 
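# A minimal usage sketch for the two embedding helpers in this module. The
# function and keyword names come from the signatures above; the folder paths
# and the column name 'abstract' are illustrative assumptions.
from semanticlayertools.visual.utils import (
    embeddedTextPlotting, embeddedTextClustering
)

# Embeds the corpus texts, saves 'embeddedCorpus.pt' plus a 2D projection,
# and returns the metadata dataframe with x/y coordinates for plotting.
plotdf = embeddedTextPlotting(
    infolderpath='./clusterfiles/',
    columnName='abstract',
    outpath='./embeddings/',
)

# Reuses the saved tensor, maps it to 50 dimensions and adds HDBSCAN labels.
clusterdf = embeddedTextClustering(
    infolderpath='./clusterfiles/',
    columnName='abstract',
    emdeddingspath='./embeddings/embeddedCorpus.pt',  # parameter name as spelled in the source
    outpath='./embeddings/',
)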
print('\tDone.\nStarting clustering.') cluster = hdbscan.HDBSCAN( min_cluster_size=20, diff --git a/tox.ini b/tox.ini index 389f70c..40887f1 100644 --- a/tox.ini +++ b/tox.ini @@ -17,9 +17,15 @@ commands = pytest {posargs} [testenv:docs] description = invoke sphinx-build to build the HTML docs -basepython = python3.7 +basepython = python3.9 deps = - sphinx >= 1.7.5, < 2 + sphinx sphinx_rtd_theme + plotly + hdbscan + umap-learn + torch + sentence-transformers + https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0.tar.gz#egg=en_core_web_lg commands = sphinx-build -d "{toxworkdir}/docs_doctree" docs "{toxworkdir}/docs_out" --color -W -bhtml {posargs} python -c 'import pathlib; print("documentation available under file://\{0\}".format(pathlib.Path(r"{toxworkdir}") / "docs_out" / "index.html"))' From 4849addc967b8076b40babfa56d4f9c720f34b88 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 5 Jan 2022 17:42:13 +0100 Subject: [PATCH 23/53] minor fixes --- src/semanticlayertools/visual/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index c49aef8..5952052 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -100,8 +100,8 @@ def embeddedTextPlotting(infolderpath: str, columnName: str, outpath: str): except ValueError: raise dataframe = pd.concat(clusterdf, ignore_index=True) - dataframe = dataframe.dropna(subset=[columnName], axis=0) - corpus = [x[0] for x in dataframe[columnName].values if x] + dataframe = dataframe.dropna(subset=[columnName], axis=0).reset_index(drop=True) + corpus = [x[0] for x in dataframe[columnName].values] print('Start embedding.') corpus_embeddings = model.encode( corpus, @@ -144,7 +144,7 @@ def embeddedTextClustering( except ValueError: raise dataframe = pd.concat(clusterdf, ignore_index=True) - dataframe = dataframe.dropna(subset=[columnName], axis=0) + dataframe = dataframe.dropna(subset=[columnName], axis=0).reset_index(drop=True) print('Loading embedding.') corpus_embeddings = torch.load(emdeddingspath) print('\tDone\nStarting mapping to lower dimensions.') From c97976f61d1c343ceac04e0b27e4ba821daa64f9 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 6 Jan 2022 13:11:24 +0100 Subject: [PATCH 24/53] add doc --- docs/_static/bmbf.png | Bin 0 -> 12534 bytes docs/_static/logo.png | Bin 0 -> 13387 bytes docs/_static/mpiwg.png | Bin 0 -> 37002 bytes docs/clustering.rst | 7 +- docs/conf.py | 6 +- docs/index.rst | 16 ++- docs/linkage.rst | 5 +- src/semanticlayertools/clustering/infomap.py | 76 +++++++++--- src/semanticlayertools/clustering/leiden.py | 68 +++++++++-- src/semanticlayertools/clustering/reports.py | 116 ++++++++++++++++--- src/semanticlayertools/linkage/cocitation.py | 1 - 11 files changed, 247 insertions(+), 48 deletions(-) create mode 100644 docs/_static/bmbf.png create mode 100644 docs/_static/logo.png create mode 100644 docs/_static/mpiwg.png diff --git a/docs/_static/bmbf.png b/docs/_static/bmbf.png new file mode 100644 index 0000000000000000000000000000000000000000..02558031aadb9fc757662041de5d4a5c54eaa3eb GIT binary patch literal 12534 zcmb_@WmHvR_vf|1KtMqRM38QfkPcDlQlz`3TRKGr3F$5+1*H2@0@4iv(#@qCX_$TA znYCs<%&b|n{^MH62hTnCoacG=ulDxKYdP_|w~22f5Qw{y60a2C_ayw}-o${v=j!8H z5QwL>lCMM*of9{byi_BVrY?0n9^H8Nx}DAa+ZTLa$3ld8FpF2=86C6uj!NK?5WQeP zo`0V3z^jy}_WhP`HSwN`Ry-Zi$uOsjjd~Hlv~Y5BspmS~?W@|MZMt1I*#!>|aVocr 
[GIT binary patch literals omitted: base85-encoded image data for
 docs/_static/bmbf.png (12534 bytes), docs/_static/logo.png (13387 bytes)
 and docs/_static/mpiwg.png (37002 bytes); the patch series breaks off
 inside the mpiwg.png literal.]
zL=LXX`hC-Vaj9Q-+;RJ!0k3x)+8_`&gfCgmzIo5TAly1GgRO6JDHe1XmWQ&f-Th zZOXEKR9N$i`X!1imi&7Cyhj$&O1r-vl?*fIHhO(k$^FwHQmQR}k@KtvnPNB_%2w+%J zm2v*kK7*VWWc#25K=?GA)T;&;d}`GGalQ>N(?iZ4&dN!X+|5z*C6It)*$Y66oAp@> zMeG3i{8#|42ecX<>_6vCips7oEpK>6KOT86j~B~lzy=h+rpt3B3O@-uXxPo?iQ6C+ zVgZ#7YVpkX>wF*3;f`z$174RGt0jBFXPoj|fZ%G&wuI;fz`W77-TUFp+Oo?m^CTK6 zcuJdyiB_MDd}L=AI=k=SSnyLJ;Dv_9TS~F>NC~{w6qlT@HC<$Eo+!|6Gx#~Yz(t97 zfy06so1>IC=Vw~}YVO0#nKOGb|JD=WE%#Q4OHPA0qK7#TZO$R>MOEQ&`dR;xIXzyO z4u`N-W;k(YN6z=|yWKsuZEhKd;f@2hdk&2&xG~B3O5EH6?T{DajkPPMOYrT=vv1R} zL+1G+;0v2^$z2NP%|N}oN>qN=#ESY2$(2q%H*L=C4zP0pK11PEVJ!L#Z4UhfQ3PiX z;uPPkL{ITiX##lB(zFg^=^d)gPT<>6T?HR={wpEJ9gi^&2=|fk&!Licb2dzghT7;% zzY=^q(`t|&TaGrFwr*TKIsluq0Hpo&<*_NZ zlWF`u?sU#e6og?3ZV|X@aMo>CzQK1^jcZImD8J;2I29Xd2ZuHwgPg8lFF^C-weG!( z7v6G2Xrk4()11z@A$H-fiuwrxhT)o1X#i)qR?LIkdbG0I7_ZjWr5 zHnsTgdhS{L!meH2MZmX(^4$Wki()*7e}U!i6Nzm*O|@m+_~<4tvkIpLlt@=#0q_k zZSntcg)c|H>Psu@a3ot^$x^`L0Z+CHnO{;MF$yeL;O@1v+4l;#Cd&;n;-aiUx2=9` z;J9(b_I7=1q{Kh7*7sY%SY31w1|Q6LGptw7+oPBB&TMmMbee0~QlAU`abrBzdcq3- znrW<2QTY=?4m#>ny%R_}5xCti4Bj`;|5pLNXY0qFjr-t9v?adt(eLZ<5j-n29{>s% zA5s=VtNJsgr1FC*kjO^sR3Q%#2O( zFYzMc#2gFiEF%xuwf(6n&le}0;`~Xb0gt%sl`Fm7RX;l}z?74@(5}aaY%la>Cq7yK z!H4oKaO$rXciU@`JQ!h|k#s{6eAA_%ET1q1pn^3~)5N@l3q)k(i{VQCOg)!q&N8c} z@*qtFt^;Ub2Oc&AVqB1;E@uia&y~#vI%8M(PH`&051IqXTznZ`+NBVoYydaCz1SgM_4^&7cmMsq zF>~=aymB&7_89ncx&SivK4|MF`j$`+*-`!&AD}m37dxU$%^eQ_Y#+IER?i z^-QA`Jj3iZ_`s#eyb4LomON3_(9gl zK5b7s9zYWDj3Uo&$RRIER*64o+S1P^1E8)Yxu#Y7apTBe9Ai+Y5^M1rw#ub3wproo zVqJMZv;f}Mg`?h*-Q$hxeZK$*-;R1?8=sF_OPmA_?F7zYDURLE9KGL{pc{u?IY`Fy zBQHsT+)|*86ZL%u4!B=L%ZhqoS!LbR6hEiIbzG~TO~vw>Og)+|?uE$Ux31~+kD3zV z2ItLl_wns$xfDPLU1G6603N=>GJ65!Nb>PLYDG6WR{i1=hzgbf07>ob4Xf73yMyL~ zyL2WLtMp{z(9TxS?M1ZtG49_JYOnWQF4xv%>X3P}O92VK*`=?nJqiUp)visE+$aCi zRpmTf?@ERc%u`!74M5_tstO=C-&N(AXqA>o9LZnj*0t{2s%aAelNX@9p^21BCl6;z zGtd}7EH(z~5gZ>OF&g&{d)LvRxd>@B_};yipSpyrl7=W1>%UKo9R6Gkt`=F9I~y#C4b zND8E6$U3qb86zo>T?#-H@gz9lL$R2zF}5U%HReAbacP5ZyACahJc42mV>^k5u&GFg z6ixyNL*0$T!KWxrymV?gor?0SNoDm-sg;(c@<9O!zWJcKY%?Va@S;BRnXidXvklQ# z01h?cD6c*}*Y{9`dlqfdUuoNMWB)Mp5hf}5P<%X2MF0C|vQOZ8Kiy-m!G;rYnXF&d zl^qI5@XZcgWvy{2ARzDr5XH%5Or{-rPjwk|e8m)F3Ym7a=QXveb~SoNfBy>gFlgI? zi7Jc(2OQGBAnMp+*6~fhR#`)~DUkNAYPLHl>z5RWM*%?ip7HY7DV;Pl6sPm2;8@cZ zmC(4LSTUFmu-IvgvM=!|Ai+02jbskHQvjQAb89R1wR<=9{Ug(oMO-}x;s{blOY6eN zXGp;7dzp?8r&WpFmm1#X7_Zf0%82~dxSFsDd>1%u=j z(aIG{e4BeA{?ee#^Xw)HHrVv@{uS9|}nDwa;91 z`kv5HUhZdA*dVfy+hXA@M1-X?vI8-z)-@jPg}4^UA6`Vg1?Y^%h79R}$8Im@>uv14I#Ptwtue z`kYm~jI$R7B>38EK00l0$gnqLy${A3Ah_6ld;#s>o|>{{%+FWA<NJiS4$0j8Y6prUBNWqH}10up@fK0}?L ze~pceDieX*0iLgyYHbDr5>s@Ga!o>q31Gs$nU7+->Fc1*=k)&UE491yh^a z-~9ggwP+e6_d%Uy6EgmQ3p9rmI~3@|6;ih+PMon7x~VXTliCFV<{G#Sx+EFDZK<`? zIZAP+UEDcF(z!<=1@oPIer+>4QZt)X>x1)5E0;h}jS|Z9!On8g7r&>2(;ESmDXh%+ z-<>L-JE@}nJKGEBT0nqj|Bp?7_UeUMz&?BUm3o~Wx5zf#O#~y=`y^s_rR0_PK}A`Iq(B-JXsC7_ zgAKBWpyd$deH2H;s=m!#ePj-ZC%%;co8hB zrKPvv=mk448WkEE+(V#}XR)!ycNMAp`e@~3eEumQ!8iYNAbZe7DFEO-N@&{4QSDfC z&}GjB3l=y#f^B!?*@OGNk3imQu$ci_uaB$z!a_DMZysxB49g9-&N!=bOtz=M0}r@M zHmvu40^mF-klM(vn})(&&?VSXn+VHqee~r4yscbOehwSQuj=FF zz5OU4Rn6=-C$dLffC4}KzAxH)C(Z+al`V`^+ zm|9W)S^D+J8gfAaDf7(*m1P@}0u~e?|8%Rq3>@UxL*1bIT8iL!?J#)1JkgvL!i7B{ z-#sl99}EkzUnq>U9e9zKiX!prvNBIoN4&hVGX*60+IekQ9&`_ +Socio-epistemic networks: Modelling Historical Knowledge Processes, +part of Department I of the Max Planck Institut for the History of Science and +funded by the Federal Ministry of Education and Research, Germany (Grant No. 01 UG2131). + +.. image:: _static/bmbf.png .. 
toctree:: - :maxdepth: 2 + :maxdepth: 3 :caption: Contents: cleaning diff --git a/docs/linkage.rst b/docs/linkage.rst index 3e65b98..8e3b989 100644 --- a/docs/linkage.rst +++ b/docs/linkage.rst @@ -1,11 +1,14 @@ Word scoring and linkage ======================== +Link papers by Ngram scoring +**************************** .. automodule:: semanticlayertools.linkage.wordscore :members: :undoc-members: - +Generate network of cocitations +******************************* .. automodule:: semanticlayertools.linkage.cocitation :members: :undoc-members: diff --git a/src/semanticlayertools/clustering/infomap.py b/src/semanticlayertools/clustering/infomap.py index f566d68..4bbabd2 100644 --- a/src/semanticlayertools/clustering/infomap.py +++ b/src/semanticlayertools/clustering/infomap.py @@ -1,45 +1,91 @@ import os +import re from tqdm import tqdm import infomap class Clustering(): - """Cluster using infomap.""" + """Cluster mulitlayer time-dependent networks using the infomap algorithm. + + Calculates clusters using the infomap algorithm. Input files are assumed + to have multilayer Pajek format and contain the year in four digit format. + The default settings for running the method assume an undirected multilayer + network and will use at most 5 optimization runs. + + :param inpath: Path to input pajek files + :type inpath: str + :param outpath: Path for writing resulting cluster data + :type outpath: str + :param recreate: Toggle recreation of already exisiting files + :type recreate: bool + :param infomapSettings: Initializing arguments for the infomap algorithm. + :type infomapSettings: str + :param debug: Toggle writing of debug info to standard output. + :type debug: bool + + .. seealso:: + Martin Rosvall and Carl T. Bergstrom (2008). + Maps of information flow reveal community structure in complex networks. + PNAS, 105, 1118. 10.1073/pnas.0706851105 + """ def __init__( self, - infomapSettings="-N5 -imultilayer -fundirected --silent" + inpath: str, + outpath: str, + recreate: bool = False, + infomapSettings: str = "-N5 -imultilayer -fundirected --silent", + debug: bool = False ): + self.inpath = inpath + self.outpath = outpath self.infomult = infomap.Infomap(infomapSettings) + self.recreate = recreate + self.debug = debug + + def calcInfomap(self, inFilePath): + """Calculate clusters for one pajek file. + + Writes found cluster (i.e. module) information in CLU and FlowTree file + format to output path. + + :param inFilePath: Path to input pajek file + :type inFilePath: str + :raises OSError: If one of the output files for this year already exists. + :returns: Writes two files with found cluster information, method return value is empty + :rtype: None - def calcInfomap(self, inFilePath, outPath, recreate=False, debug=False): - """Calc clusters for one pajekt file.""" - year = inFilePath.split(os.path.sep)[-1].split('_')[1].split('.')[0] - cluFilePath = f'{outPath}slice_{year}.clu' - ftreeFilePath = f'{outPath}slice_{year}.ftree' + .. seealso:: + Infomap python documentation on mapequation + `Infomap module `_ + """ + filename = inFilePath.split(os.pathsep)[-1] + year = re.findall(r'\d{4}', filename)[0] + cluFilePath = f'{self.outpath}slice_{year}.clu' + ftreeFilePath = f'{self.outpath}slice_{year}.ftree' if os.path.isfile(cluFilePath) or os.path.isfile(ftreeFilePath): - if recreate is False: - raise IOError( + if self.recreate is False: + raise OSError( f'Files at {cluFilePath} or {ftreeFilePath} exists. Set recreate = True to rewrite files.' 
) - if recreate is True: + if self.recreate is True: os.remove(cluFilePath) os.remove(ftreeFilePath) self.infomult.readInputData(inFilePath) self.infomult.run() self.infomult.writeClu(cluFilePath) self.infomult.writeFlowTree(ftreeFilePath) - if debug: + if self.debug is True: print( f"Clustered in {self.infomult.maxTreeDepth()} levels with codelength {self.infomult.codelength}" ) print("\tDone: Slice {0}!".format(year)) return - def run(self, pajekPath='./', outPath='./', recreate=False, debug=False): - """Calculate infomap clustering for all pajek files in path.""" + def run(self): + """Calculate infomap clustering for all pajek files in input path.""" pajekFiles = sorted( - [pajekPath + x for x in os.listdir(pajekPath) if x.endswith('.net')] + [self.inpath + x for x in os.listdir(self.inpath) if x.endswith('.net')] ) for file in tqdm(pajekFiles): - self.calcInfomap(inFilePath=file, outPath=outPath, debug=debug) + self.calcInfomap(inFilePath=file) diff --git a/src/semanticlayertools/clustering/leiden.py b/src/semanticlayertools/clustering/leiden.py index c695d21..2d2e561 100644 --- a/src/semanticlayertools/clustering/leiden.py +++ b/src/semanticlayertools/clustering/leiden.py @@ -1,24 +1,51 @@ import os import time import re -from typing import TypeVar - from tqdm import tqdm import igraph as ig import leidenalg as la -debugVar = TypeVar('debugVar', bool, str) - class TimeCluster(): - """Cluster time-sliced data with the Leiden algorithm.""" + """Cluster time-sliced data with the Leiden algorithm. + + Calculates temporal clusters of e.g. time-sliced cocitation or citation + data, using the Leiden algorithm . Two nodes are assumed to be identical in + different year slices, if the node name is the same. + This could be e.g. the bibcode or DOI. + + Input files are assumed to include the year in the filename, have an ending + `_GC.net` to denote their giant component character and should be in Pajek + format. + + The resolution parameter can be seen as a limiting density, above + which neighbouring nodes are considered a cluster. The interslice coupling + describes the influcence of yearly order on the clustering process. See doc + for the Leiden algorithm for more detailed info. + + :param inpath: Path for input network data + :type inpath: str + :param outpath: Path for writing output data + :type outpath: str + :param resolution: Main parameter for the clustering quality function (Constant Pots Model) + :type resolution: float + :param intersliceCoupling: Coupling parameter between two year slices, also influences cluster detection + :type intersliceCoupling: float + :param timerange: The time range for considering input data (default=1945,2005)) + :type timerange: tuple + :raises OSError: If the output file already exists at class instantiation + + .. seealso:: + Traag, V.A., Waltman. L., Van Eck, N.-J. (2018). + From Louvain to Leiden: guaranteeing well-connected communities. + Scientific reports, 9(1), 5233. 10.1038/s41598-019-41695-z + """ def __init__( self, inpath: str, outpath: str, resolution: float = 0.003, intersliceCoupling: float = 0.4, timerange: tuple = (1945, 2005), - debug: debugVar = False ): starttime = time.time() self.inpath = inpath @@ -26,7 +53,6 @@ def __init__( self.res_param = resolution self.interslice_param = intersliceCoupling self.timerange = timerange - self.debug = debug self.outfile = os.path.join( outpath, @@ -57,8 +83,28 @@ def __init__( f"loaded in {time.time() - starttime} seconds." 
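For orientation, a minimal usage sketch of the reworked infomap `Clustering` class documented in the hunk above; the folder names are placeholder assumptions, and note that `outpath` is used as a plain filename prefix, so a trailing separator is expected.

~~~python
# Illustrative sketch (not from the patch series): cluster every multilayer
# Pajek file in a folder with the infomap-based Clustering class added above.
# Folder names are hypothetical placeholders.
from semanticlayertools.clustering.infomap import Clustering

clusterer = Clustering(
    inpath="./pajek/",       # *.net multilayer Pajek files, four-digit year in each filename
    outpath="./clusters/",   # used as a plain string prefix, so keep the trailing slash
    recreate=True,           # overwrite existing slice_<year>.clu / .ftree output
)
clusterer.run()              # iterates over all *.net files and writes CLU and FlowTree files
~~~

Alternative infomap options can be passed through the constructor's `infomapSettings` string if a different optimization setup is needed.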
) - def optimize(self, clusterSizeCompare: int=1000): - """Optimize clusters accross time slices.""" + def optimize(self, clusterSizeCompare: int = 1000): + """Optimize clusters accross time slices. + + This runs the actual clustering and can be very time and memory + consuming for large networks. Depending on the obtained cluster results, + this method has to be run iteratively with varying resolution parameter. + Output is written to file, with filename containing chosen parameters. + + The output CSV contains information on which node in which year belongs + to which cluster. As a first measure of returned clustering, the method + prints the number of clusters found above a threshold defined by + `clusterSizeCompare`. This does not influence the output clustering. + + :param clusterSizeCompare: Threshold for `interesting` clusters + :type clusterSizeCompare: int + :returns: Tuple of output file path and list of found clusters in tuple format (node, year, cluster) + :rtype: tuple + + .. seealso:: + Documentation of time-layer creation routine: + `Leiden documentation `_ + """ starttime = time.time() layers, interslice_layer, _ = la.time_slices_to_layers( @@ -109,7 +155,9 @@ def optimize(self, clusterSizeCompare: int=1000): outfile.write( f"{elem[0]},{elem[1]},{elem[2]}\n" ) - largeclu = [(x,len(x.vs)) for x in subgraphs if len(x.vs)>clusterSizeCompare] + largeclu = [ + (x, len(x.vs)) for x in subgraphs if len(x.vs) > clusterSizeCompare + ] print( f'Finished in {time.time() - starttime} seconds.' f"Found {len(subgraphs)} clusters, with {len(largeclu)} larger then {clusterSizeCompare} nodes." diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index a4b27c6..1b9ff04 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -9,7 +9,6 @@ import textacy import textacy.tm import pandas as pd -import numpy as np import warnings num_processes = multiprocessing.cpu_count() @@ -19,15 +18,54 @@ class ClusterReports(): + """Generate reporting on time-clusters. + + Generate reports to describe the content for all found clusters above a + minimal size by collecting metadata for all publications in each cluster, + finding the top 20 authors and affiliations of authors involved in the + cluster publications, and running basic NMF topic modelling with N=20 and + N=50 topics (english language models are used!). + For each cluster a report file is written to the output path. + + Input CSV filename is used to create the output folder in output path. For + each cluster above the limit, a subfolder is created to contain all metadata + for the cluster. The metadata files are assumed to be in JSONL format and + contain the year in the filename. + + :param infile: Path to input CSV file containing information on nodeid, clusterid, and year + :type infile: str + :param metadatapath: Path to JSONL (JSON line) formated metadata files. + :type metadatapath: str + :param outpath: Path to create output folder in, foldername reflects input filename + :type outpath: str + + :param textcolumn: The dataframe column of metadata containing textutal for topic modelling (default=title) + :type textcolumn: str + :param numberProc: Number of CPU the routine will use (default = all!) 
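The refactored `TimeCluster` class above can be driven in the same spirit; the following is a sketch under assumed paths, with the return shape taken from the new docstring.

~~~python
# Illustrative sketch (not from the patch series): temporal Leiden clustering of
# yearly giant-component Pajek files ("*_GC.net" with a four-digit year in the name).
from semanticlayertools.clustering.leiden import TimeCluster

tc = TimeCluster(
    inpath="./cocite/",          # yearly *_GC.net input networks
    outpath="./timeclusters/",
    resolution=0.003,            # CPM resolution, larger values give smaller, denser clusters
    intersliceCoupling=0.4,      # coupling strength between consecutive year slices
    timerange=(1960, 1990),
)
# Per the new docstring, optimize returns the CSV path and (node, year, cluster) tuples.
outfile, clusters = tc.optimize(clusterSizeCompare=1000)
~~~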
+ :type numberProc: int + :param minClusterSize: The minimal cluster size, above which clusters are considered (default=1000) + :type minClusterSize: int + :param timerange: Time range to evalute clusters for (usefull for limiting computation time, default = (1945, 2005)) + :type timerange: tuple + """ def __init__( self, infile: str, metadatapath: str, outpath: str, + textcolumn: str = 'title', + authorColumnName: str = 'author', + affiliationColumnName: str = 'aff', + publicationIDcolumn: str = 'nodeID', numberProc: int = num_processes, minClusterSize: int = 1000, timerange: tuple = (1945, 2005) ): + """Constructor method""" self.numberProc = numberProc self.minClusterSize = minClusterSize self.metadatapath = metadatapath + self.textcolumn = textcolumn + self.authorColumnName = authorColumnName + self.affiliationColumnName = affiliationColumnName + self.publicationIDcolumn = publicationIDcolumn clusterdf = pd.read_csv(infile) basedata = clusterdf.groupby(['year', 'cluster']).size().to_frame('counts').reset_index() self.largeClusterList = list( @@ -47,9 +85,19 @@ def __init__( os.mkdir(os.path.join(self.outpath, f'Cluster_{clu}')) def create_corpus(self, dataframe): - """Create corpus out of dataframe.""" + """Create corpus out of dataframe. + + Using the text contained in the cluster metadata to generate a corpus. + After some basic preprocessing each text is used to generate a Spacy doc, + of which only the lemmatized words without stop words are considered. + + :params dataframe: Input dataframe + :type dataframe: `pd.Dataframe` + :returns: A textacy corpus file with english as the base language + :rtype: `textacy.Corpus` + """ docs = [] - titles = [x[0] for x in dataframe.title.values if type(x) == list] + titles = [x[0] for x in dataframe[self.textcolumn].values if type(x) == list] for title in tqdm(titles, leave=False): try: # text pre-processing @@ -74,7 +122,22 @@ def create_corpus(self, dataframe): def find_topics( self, corpus_titles: list, n_topics: int, top_words: int, ): - """Calculate topics in corpus.""" + """Calculate topics in corpus. + + Use NMF algorithm to calculate topics in corpus file for `n_topics` + topics, returning `top_words` most common words for each topic. + Each word has to occure at least twice in the corpus and at most in 95% + of all documents. + + :param corpus_titles: The corpus containing the preprocessed texts. + :type corpus_titles: `textacy.Corpus` + :param n_topics: Number of considered topics + :type n_topics: int + :param top_words: Number of returned words for each found topic + :type top_words: int + :returns: List of found topics with top occuring words + :rtype: str + """ vectorizer = textacy.representations.vectorizers.Vectorizer( tf_type="linear", idf_type="smooth", @@ -93,17 +156,26 @@ def find_topics( model.fit(doc_term_matrix) topics = [] - for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_words): - topics.append("topic " + str(topic_idx) + ": " + " ".join(top_terms)) + for topic_idx, top_terms in model.top_topic_terms( + vectorizer.id_to_term, top_n=top_words + ): + topics.append( + "topic " + str(topic_idx) + ": " + " ".join(top_terms) + ) outtext = f'\n\n\tTopics in cluster for {n_topics} topics:\n' for topic in topics: outtext += f'\t\t{topic}\n' return outtext - def fullReport(self, cluster, authorColumnName: str = 'author', - affiliationColumnName: str = 'aff' - ): - """Generate full cluster report.""" + def fullReport(self, cluster): + """Generate full cluster report for one cluster. 
+ + :param cluster: The cluster number to process + :type cluster: int or str + :raises ValueError: If input cluster data can not be read. + :returns: Report text with all gathered informations + :rtype: str + """ starttime = time.time() clusterpath = os.path.join(self.outpath, f'Cluster_{cluster}') clusterfiles = os.listdir(clusterpath) @@ -120,13 +192,13 @@ def fullReport(self, cluster, authorColumnName: str = 'author', inputnodes = set(basedf.node.values) notFound = inputnodes.difference(set(dfCluster.nodeID.values)) topAuthors = Counter( - [x for y in dfCluster[authorColumnName].fillna('').values for x in y] + [x for y in dfCluster[self.authorColumnName].fillna('').values for x in y] ).most_common(20) authortext = '' for x in topAuthors: authortext += f'\t{x[0]}: {x[1]}\n' topAffils = Counter( - [x for y in dfCluster[affiliationColumnName].fillna('').values for x in y] + [x for y in dfCluster[self.affiliationColumnName].fillna('').values for x in y] ).most_common(21) affiltext = '' for x in topAffils[1:]: @@ -156,12 +228,19 @@ def fullReport(self, cluster, authorColumnName: str = 'author', print('\t\tFinished topics.') return outtext - def _mergeData(self, filename, publicationIDcolumn: str = 'nodeID'): + def _mergeData(self, filename): + """Merge metadata for cluster nodes. + + Writes all metadata for nodes in cluster to folders. + + :param filename: Metadata input filename + :type filename: str + """ filepath = os.path.join(self.metadatapath, filename) data = pd.read_json(filepath, lines=True) selectMerge = data.merge( self.clusternodes, - left_on=publicationIDcolumn, + left_on=self.publicationIDcolumn, right_on='node', how='inner' ) @@ -177,6 +256,14 @@ def _mergeData(self, filename, publicationIDcolumn: str = 'nodeID'): return '' def gatherClusterMetadata(self): + """Initial gathering of metadata for clusters. + + For all files in the metadata path, call `_mergeData` if the found + year in the filename falls in the bounds. + + This step needs to be run once, the all cluster metadata is generated + and can be reused. 
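A minimal sketch of the full `ClusterReports` workflow described above, assuming placeholder paths and a clustering CSV produced by the Leiden step:

~~~python
# Illustrative sketch (not from the patch series): reporting on all clusters above
# the minimal size. Paths and the CSV filename are placeholders.
from semanticlayertools.clustering.reports import ClusterReports

reports = ClusterReports(
    infile="./timeclusters/clusters.csv",  # CSV with node, year and cluster columns
    metadatapath="./metadata/",            # JSONL metadata files with the year in the filename
    outpath="./reports/",
    textcolumn="title",                    # column fed into the NMF topic models
    minClusterSize=1000,
    timerange=(1960, 1990),
)
reports.gatherClusterMetadata()   # run once, merges the metadata into one subfolder per cluster
reports.writeReports()            # writes Cluster_<id>.txt with top authors, affiliations and topics
~~~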
+ """ filenames = os.listdir(self.metadatapath) yearFiles = [] for x in filenames: @@ -191,6 +278,7 @@ def gatherClusterMetadata(self): return def writeReports(self): + """Generate reports and write to output path.""" for cluster in tqdm(self.largeClusterList, leave=False): outtext = self.fullReport(cluster) with open(f'{self.outpath}Cluster_{cluster}.txt', 'w') as file: diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index 2872b33..9d30fb6 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -1,4 +1,3 @@ -"""Link documents by cocitation.""" import os import time import re From 62d500a29db36ff186554727f6d9435cbdd56a0e Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 6 Jan 2022 13:32:13 +0100 Subject: [PATCH 25/53] add readthedocs yaml --- .readthedocs.yaml | 15 +++++++++++++++ docs/requirements.txt | 8 ++++++++ 2 files changed, 23 insertions(+) create mode 100644 .readthedocs.yaml create mode 100644 docs/requirements.txt diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..3943fd8 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,15 @@ +# File: .readthedocs.yaml + +version: 2 + +build: + os: "ubuntu-20.04" + tools: + python: "3.9" + +sphinx: + configuration: docs/conf.py + +python: + install: + - requirements: docs/requirements.txt diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..25dc0d6 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,8 @@ +sphinx +sphinx_rtd_theme +plotly +hdbscan +umap-learn +torch +sentence-transformers +https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0.tar.gz#egg=en_core_web_lg From 5c129dad54c4d7ca97b9af760ec298c2ba007c3b Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 6 Jan 2022 14:15:13 +0100 Subject: [PATCH 26/53] add req --- docs/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/requirements.txt b/docs/requirements.txt index 25dc0d6..80636a9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -6,3 +6,4 @@ umap-learn torch sentence-transformers https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0.tar.gz#egg=en_core_web_lg +semanticlayertools From 4b547464eafdc028bd9c0bfc9ed99a715055f457 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 6 Jan 2022 14:39:22 +0100 Subject: [PATCH 27/53] add readme and license files from mainpage to docs --- LICENSE => LICENSE.md | 0 README.md | 16 ++++++++++++++-- docs/conf.py | 4 +++- docs/index.rst | 2 ++ docs/license.rst | 3 +++ docs/readme.rst | 3 +++ docs/requirements.txt | 1 + tox.ini | 1 + 8 files changed, 27 insertions(+), 3 deletions(-) rename LICENSE => LICENSE.md (100%) create mode 100644 docs/license.rst create mode 100644 docs/readme.rst diff --git a/LICENSE b/LICENSE.md similarity index 100% rename from LICENSE rename to LICENSE.md diff --git a/README.md b/README.md index df7a75b..04d4856 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,15 @@ -# SemanticLayerTools +## SemanticLayerTools -Collects tools to create semantic layers in the socio-epistemic networks framework. Source material can be any structured corpus with metadata of authors, time, and at least one text column. \ No newline at end of file +Collects tools to create semantic layers in the socio-epistemic networks framework. 
Source material can be any structured corpus with metadata of authors, time, and at least one text column. + +## Installation + +Using pip `pip install semanticlayertools` + +## Testing + +Using tox `tox` + +## Building documentation + +Using tox `tox -e docs` diff --git a/docs/conf.py b/docs/conf.py index 24a23b7..dea5654 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -34,7 +34,8 @@ # ones. extensions = [ 'sphinx.ext.autodoc', - 'sphinx.ext.intersphinx' + 'sphinx.ext.intersphinx', + 'm2r2' ] # Add any paths that contain templates here, relative to this directory. @@ -45,6 +46,7 @@ # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +source_suffix = [".rst", ".md"] # -- Options for HTML output ------------------------------------------------- diff --git a/docs/index.rst b/docs/index.rst index 112785c..f53ba76 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -20,11 +20,13 @@ funded by the Federal Ministry of Education and Research, Germany (Grant No. 01 :maxdepth: 3 :caption: Contents: + readme cleaning pipelines linkage clustering visual + license diff --git a/docs/license.rst b/docs/license.rst new file mode 100644 index 0000000..4f3620b --- /dev/null +++ b/docs/license.rst @@ -0,0 +1,3 @@ +License +======= +.. mdinclude:: ../LICENSE.md diff --git a/docs/readme.rst b/docs/readme.rst new file mode 100644 index 0000000..60d167e --- /dev/null +++ b/docs/readme.rst @@ -0,0 +1,3 @@ +README +====== +.. mdinclude:: ../README.md diff --git a/docs/requirements.txt b/docs/requirements.txt index 80636a9..63818de 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,6 @@ sphinx sphinx_rtd_theme +m2r2 plotly hdbscan umap-learn diff --git a/tox.ini b/tox.ini index 40887f1..24a7a07 100644 --- a/tox.ini +++ b/tox.ini @@ -21,6 +21,7 @@ basepython = python3.9 deps = sphinx sphinx_rtd_theme + m2r2 plotly hdbscan umap-learn From 0eb2e2115561b5f6661f518be5b85a84c6ee79e6 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Fri, 7 Jan 2022 11:16:31 +0100 Subject: [PATCH 28/53] clean doc building --- README.md | 47 +++++++++++++++- docs/index.rst | 2 +- docs/readme.rst | 4 +- docs/requirements.txt | 6 -- docs/visual.rst | 58 +++++++++++++++++++- setup.cfg | 7 +-- src/semanticlayertools/clustering/reports.py | 6 +- src/semanticlayertools/visual/utils.py | 31 ++++++----- tox.ini | 6 -- 9 files changed, 124 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 04d4856..ac127c3 100644 --- a/README.md +++ b/README.md @@ -2,14 +2,55 @@ Collects tools to create semantic layers in the socio-epistemic networks framework. Source material can be any structured corpus with metadata of authors, time, and at least one text column. +Documentation is available on [ReadTheDocs](https://semanticlayertools.readthedocs.io/). + ## Installation -Using pip `pip install semanticlayertools` +tl;dr Use pip + +~~~bash +pip install semanticlayertools +~~~ + +Consider using a clean virtual environment to keep your main packages separated. +Create a new virtual environment and install the package + +~~~bash +python3 -m venv env +source env/bin/activate +pip install semanticlayertools +~~~ + +To use some sentence embedding utility functions please install with the +`embeddml` option + +~~~bash +pip install semanticlayertools[embeddml] +~~~ ## Testing -Using tox `tox` +Tests can be run by installing the _dev_ requirements and running `tox`. 
+ +~~~bash +pip install semanticlayertools[dev] +tox +~~~ ## Building documentation -Using tox `tox -e docs` +The documentation is build using _sphinx_. Install with the _dev_ option and run + +~~~bash +pip install semanticlayertools[dev] +tox -e docs +~~~ + +## Funding information + +The development is part of the research project [ModelSEN](https://modelsen.mpiwg-berlin.mpg.de) + +> Socio-epistemic networks: Modelling Historical Knowledge Processes, + +in Department I of the Max Planck Institute for the History of Science +and funded by the Federal Ministry of Education and Research, Germany (Grant No. 01 UG2131). diff --git a/docs/index.rst b/docs/index.rst index f53ba76..64da186 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,7 +11,7 @@ social, semiotic or semantic layers from text corpora. The development is part of the research project `ModelSEN `_ Socio-epistemic networks: Modelling Historical Knowledge Processes, -part of Department I of the Max Planck Institut for the History of Science and +part of Department I of the Max Planck Institute for the History of Science and funded by the Federal Ministry of Education and Research, Germany (Grant No. 01 UG2131). .. image:: _static/bmbf.png diff --git a/docs/readme.rst b/docs/readme.rst index 60d167e..21a7aa4 100644 --- a/docs/readme.rst +++ b/docs/readme.rst @@ -1,3 +1,3 @@ -README -====== +Introduction +============ .. mdinclude:: ../README.md diff --git a/docs/requirements.txt b/docs/requirements.txt index 63818de..c9c7e88 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,10 +1,4 @@ sphinx sphinx_rtd_theme m2r2 -plotly -hdbscan -umap-learn -torch -sentence-transformers -https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0.tar.gz#egg=en_core_web_lg semanticlayertools diff --git a/docs/visual.rst b/docs/visual.rst index 29e6a31..e5e6eec 100644 --- a/docs/visual.rst +++ b/docs/visual.rst @@ -1,6 +1,58 @@ Utility functions for visualizations ==================================== -.. automodule:: semanticlayertools.visual.utils - :members: - :undoc-members: +The usage of some of these methods requires installing the package with +the extra requirements for text embedding and clustering + +.. code-block:: bash + :linenos: + + pip install semanticlayertools[embeddml] + + +Representing temporal cluster evolution with a streamgraph +********************************************************** + +This utility function is meant to support the visualization of calculated +temporal clusters. Parameters to vary are the smoothing (bool) and the minimal +cluster size to consider (default=1000). + +.. code-block:: python + :linenos: + + streamgraph(file, smooth, minClusterSize) + + +Embedding a text corpus in 2 dimensions +*************************************** + +Meant to be used to visualize a corpus on 2D by embedding a text column using +the SentenceTransformer approach of SBERT and UMAP. Time consuming method! + +.. code-block:: python + :linenos: + + embeddedTextPlotting(infolderpath, columnName, outpath, umapNeighors) + +.. seealso :: + `SBERT docs `_ + `UMAP docs `_ + + +Clustering texts using SentenceEmbedding +**************************************** + +Similar to the above method but extended to help finding large scale structures +of a given text corpus. Similar to topic modelling, in addition makes use of +HDBSCAN clustering. Reuses previously generated embedding of corpus. + +.. 
code-block:: python + :linenos: + + embeddedTextClustering( + infolderpath, columnName, embeddingspath, outpath, + umapNeighors, umapComponents, hdbscanMinCluster + ) + +.. seealso :: + `HDBSCAN docs `_ diff --git a/setup.cfg b/setup.cfg index a328a5e..6e49e3f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,16 +36,11 @@ install_requires = [options.extras_require] all = %(embeddml)s - %(doc)s %(dev)s - %(test)s -doc = - sphinx dev = twine - %(test)s -test = tox + sphinx embeddml = torch umap-learn diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index 1b9ff04..2cbcd53 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -13,9 +13,6 @@ num_processes = multiprocessing.cpu_count() -mainLanguageCorp = 'en_core_web_lg' -nlp = spacy.load(mainLanguageCorp) - class ClusterReports(): """Generate reporting on time-clusters. @@ -96,6 +93,9 @@ def create_corpus(self, dataframe): :returns: A textacy corpus file with english as the base language :rtype: `textacy.Corpus` """ + mainLanguageCorp = 'en_core_web_lg' + nlp = spacy.load(mainLanguageCorp) + docs = [] titles = [x[0] for x in dataframe[self.textcolumn].values if type(x) == list] for title in tqdm(titles, leave=False): diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index 5952052..85febac 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -6,10 +6,6 @@ import numpy as np from scipy import stats -from collections import Counter -import plotly.express as px -import plotly.graph_objects as go - from sentence_transformers import SentenceTransformer import umap import hdbscan @@ -24,7 +20,10 @@ def gaussian_smooth(x, y, grid, sd): return (weights * y).sum(1) -def streamgraph(filepath: str, smooth: smoothing=False, minClusterSize: int=1000, showNthGrid: int=5): +def streamgraph( + filepath: str, smooth: smoothing = False, + minClusterSize: int = 1000, showNthGrid: int = 5 +): """Plot streamgraph of cluster sizes vs years. 
Based on https://www.python-graph-gallery.com/streamchart-basic-matplotlib @@ -59,8 +58,8 @@ def streamgraph(filepath: str, smooth: smoothing=False, minClusterSize: int=1000 grid, y_smoothed, labels=cluDict.keys(), - baseline="sym" - ,colors=plt.get_cmap('tab20').colors + baseline="sym", + colors=plt.get_cmap('tab20').colors ) pass @@ -86,7 +85,10 @@ def streamgraph(filepath: str, smooth: smoothing=False, minClusterSize: int=1000 return fig -def embeddedTextPlotting(infolderpath: str, columnName: str, outpath: str): +def embeddedTextPlotting( + infolderpath: str, columnName: str, outpath: str, + umapNeighors: int = 200, +): """Create embedding for corpus text.""" print('Initializing embedder model.') model = SentenceTransformer('all-MiniLM-L6-v2') @@ -113,7 +115,7 @@ def embeddedTextPlotting(infolderpath: str, columnName: str, outpath: str): ) print('\tDone\nStarting mapping to 2D.') corpus_embeddings_2D = umap.UMAP( - n_neighbors=15, + n_neighbors=umapNeighors, n_components=2, metric='cosine' ).fit_transform(corpus_embeddings) @@ -130,7 +132,9 @@ def embeddedTextPlotting(infolderpath: str, columnName: str, outpath: str): def embeddedTextClustering( - infolderpath: str, columnName: str, emdeddingspath: str, outpath: str + infolderpath: str, columnName: str, emdeddingspath: str, outpath: str, + umapNeighors: int = 200, umapComponents: int = 50, + hdbscanMinCluster: int = 500, ): """Create clustering based on embedding for corpus texts.""" print('Initializing embedder model.') @@ -149,8 +153,9 @@ def embeddedTextClustering( corpus_embeddings = torch.load(emdeddingspath) print('\tDone\nStarting mapping to lower dimensions.') corpus_embeddings_50D = umap.UMAP( - n_neighbors=15, - n_components=50, + n_neighbors=umapNeighors, + n_components=umapComponents, + min_dist=0.0, metric='cosine' ).fit_transform(corpus_embeddings) np.savetxt( @@ -161,7 +166,7 @@ def embeddedTextClustering( ) print('\tDone.\nStarting clustering.') cluster = hdbscan.HDBSCAN( - min_cluster_size=20, + min_cluster_size=hdbscanMinCluster, metric='euclidean', cluster_selection_method='eom' ).fit(corpus_embeddings_50D) diff --git a/tox.ini b/tox.ini index 24a7a07..5047f72 100644 --- a/tox.ini +++ b/tox.ini @@ -22,11 +22,5 @@ deps = sphinx sphinx_rtd_theme m2r2 - plotly - hdbscan - umap-learn - torch - sentence-transformers - https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0.tar.gz#egg=en_core_web_lg commands = sphinx-build -d "{toxworkdir}/docs_doctree" docs "{toxworkdir}/docs_out" --color -W -bhtml {posargs} python -c 'import pathlib; print("documentation available under file://\{0\}".format(pathlib.Path(r"{toxworkdir}") / "docs_out" / "index.html"))' From 2ed0677f42d04f062421223736ee973d02879b79 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Fri, 7 Jan 2022 11:36:30 +0100 Subject: [PATCH 29/53] bump version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 6e49e3f..a18cfc9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = semanticlayertools -version = 0.0.3 +version = 0.0.4 author = Malte Vogl author_email = mvogl@mpiwg-berlin.mpg.de description = Create semantic layers using different methods for word linking. 
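Taken together, the patched utilities can be used as sketched below; this assumes an installation with the `embeddml` extra, placeholder paths, and keeps the parameter spelling `umapNeighors` exactly as in the source.

~~~python
# Illustrative sketch (not from the patch series): the visual utilities after this
# patch. Requires `pip install semanticlayertools[embeddml]`; paths are placeholders.
from semanticlayertools.visual.utils import streamgraph, embeddedTextPlotting

# Streamgraph of temporal cluster sizes, read from the Leiden clustering CSV output.
fig = streamgraph(
    "./timeclusters/clusters.csv",
    smooth=True,             # toggle Gaussian smoothing of the yearly counts
    minClusterSize=1000,     # hide clusters below this size
)

# Embed a text column into 2D with SBERT ('all-MiniLM-L6-v2') and UMAP; slow on large corpora.
embeddedTextPlotting(
    infolderpath="./metadata/",
    columnName="title",
    outpath="./embeddings/",
    umapNeighors=200,        # parameter spelling as in the source
)
~~~

`embeddedTextClustering` then reuses the stored embeddings and adds UMAP dimension reduction plus HDBSCAN clustering on top, as described in the visual.rst section above.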
From ffbadc1a69c35a464d5a307e06dfa56d93b54134 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 24 Jan 2022 15:00:27 +0100 Subject: [PATCH 30/53] fix link in docs --- docs/visual.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/visual.rst b/docs/visual.rst index e5e6eec..3a1b6f3 100644 --- a/docs/visual.rst +++ b/docs/visual.rst @@ -55,4 +55,4 @@ HDBSCAN clustering. Reuses previously generated embedding of corpus. ) .. seealso :: - `HDBSCAN docs `_ + `HDBSCAN docs `_ From 1e497e9114795c3a4042f45905af79831744b277 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 28 Feb 2022 14:28:37 +0100 Subject: [PATCH 31/53] linting and add cleaning to docs, extend docs for pipelines --- docs/pipelines.rst | 4 ++ src/semanticlayertools/cleaning/text.py | 24 ++++++++-- src/semanticlayertools/clustering/infomap.py | 2 +- src/semanticlayertools/clustering/leiden.py | 2 +- src/semanticlayertools/clustering/reports.py | 6 +-- src/semanticlayertools/linkage/cocitation.py | 10 ++-- src/semanticlayertools/linkage/wordscore.py | 47 ++++++++++--------- .../pipelines/cocitetimeclusters.py | 37 ++++++++++++++- .../pipelines/wordscorenet.py | 30 +++++++++++- tests/linkage/test_wordscore.py | 1 + tox.ini | 6 +++ 11 files changed, 131 insertions(+), 38 deletions(-) diff --git a/docs/pipelines.rst b/docs/pipelines.rst index 0c2ff90..511a6f4 100644 --- a/docs/pipelines.rst +++ b/docs/pipelines.rst @@ -1,11 +1,15 @@ Pipelines for workflows ======================= +Cocitation clustering pipeline +****************************** .. automodule:: semanticlayertools.pipelines.cocitetimeclusters :members: :undoc-members: +Wordscore-Multilayer pipeline +***************************** .. automodule:: semanticlayertools.pipelines.wordscorenet :members: :undoc-members: diff --git a/src/semanticlayertools/cleaning/text.py b/src/semanticlayertools/cleaning/text.py index 7f3889e..dff5e7f 100644 --- a/src/semanticlayertools/cleaning/text.py +++ b/src/semanticlayertools/cleaning/text.py @@ -8,7 +8,17 @@ def lemmaSpacy(text): - """Clean text in dataframe column.""" + """Clean text using Spacy english language model. + + A spacy doc is created using the text. For each token which is not a + stopword and longer then 3 letters the lemma is returned in lowered form. + For historical reasons, input can also be of the form + text = list("Actual text"), which sometimes results from data harvesting. + In these cases only the first element is considered! + + :param text: Input text + :type text: str + """ try: if isinstance(text, list): text = text[0] @@ -17,12 +27,20 @@ def lemmaSpacy(text): [t.lemma_ for t in doc if not t.is_stop and len(t) > 3] ) return tokens.lower() - except: + except Exception: raise def htmlTags(text): - """Remove html tags in text.""" + """Reformat html tags in text using replacement list.. + + Some specific html formating leads to confusion with sentence and token + border detection. This method outputs the cleaned + text using a replacement list. + + :param text: Input text + :type text: str + """ if isinstance(text, list): text = text[0] for tagPair in [ diff --git a/src/semanticlayertools/clustering/infomap.py b/src/semanticlayertools/clustering/infomap.py index 4bbabd2..318cc0d 100644 --- a/src/semanticlayertools/clustering/infomap.py +++ b/src/semanticlayertools/clustering/infomap.py @@ -67,7 +67,7 @@ def calcInfomap(self, inFilePath): if self.recreate is False: raise OSError( f'Files at {cluFilePath} or {ftreeFilePath} exists. Set recreate = True to rewrite files.' 
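A short illustrative sketch for the two cleaning helpers documented above; it assumes a spaCy English model such as `en_core_web_sm` is installed, and the example sentence is made up.

~~~python
# Illustrative sketch (not from the patch series): the cleaning helpers above.
# Assumes a spaCy English model (e.g. en_core_web_sm) is installed.
from semanticlayertools.cleaning.text import htmlTags, lemmaSpacy

raw = ["Gravitational <i>waves</i> were detected by several observatories."]
text = htmlTags(raw)       # rewrites the HTML tags covered by the module's replacement list
print(lemmaSpacy(text))    # lowered lemmas, stop words and tokens of three letters or less dropped
~~~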
- ) + ) if self.recreate is True: os.remove(cluFilePath) os.remove(ftreeFilePath) diff --git a/src/semanticlayertools/clustering/leiden.py b/src/semanticlayertools/clustering/leiden.py index 2d2e561..bd39d9e 100644 --- a/src/semanticlayertools/clustering/leiden.py +++ b/src/semanticlayertools/clustering/leiden.py @@ -68,7 +68,7 @@ def __init__( for idx in tqdm(range(len(edgefiles)), leave=False): try: year = re.findall(r'\d{4}', edgefiles[idx])[0] - except: + except Exception: raise if timerange[0] <= int(year) <= timerange[1]: graph = ig.Graph.Read_Pajek(os.path.join(inpath, edgefiles[idx])) diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index 2cbcd53..d6ba223 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -102,7 +102,7 @@ def create_corpus(self, dataframe): try: # text pre-processing title = re.sub("\n", " ", title) - title = re.sub("[\r|\t|\x0c|\d+]", "", title) + title = re.sub("[\r|\t|\x0c|\d+]", "", title) # noqa: W605 title = re.sub("[.,]", "", title) title = re.sub("\\\'s", "'s", title) title = title.lower() @@ -112,7 +112,7 @@ def create_corpus(self, dataframe): tokens_without_sw = ' '.join([t.lemma_ for t in doc if not t.is_stop]) docs.append(tokens_without_sw) - except: + except Exception: print(title) raise @@ -269,7 +269,7 @@ def gatherClusterMetadata(self): for x in filenames: try: year = int(re.findall(r'\d{4}', x)[0]) - except: + except Exception: raise if self.timerange[0] <= year <= self.timerange[1]: yearFiles.append(x) diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index 9d30fb6..e22bed2 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -110,7 +110,7 @@ def calculateCoCitation(self, filepath): tempG = ig.Graph.TupleList(sortCoCitCounts, weights=True, vertex_name_attr='id') components = tempG.components() sortedComponents = sorted( - [(x, len(x), len(x)*100/len(tempG.vs)) for x in components], key=lambda x: x[1], reverse=True + [(x, len(x), len(x) * 100 / len(tempG.vs)) for x in components], key=lambda x: x[1], reverse=True ) with open(os.path.join(self.outpath, infilename + '_graphMetadata.txt'), 'w') as outfile: outfile.write(f'Graph derived from {filepath}\nSummary:\n') @@ -125,7 +125,7 @@ def calculateCoCitation(self, filepath): elem[1], elem[2], len(gcompTemp.es), - len(gcompTemp.es)*100/len(tempG.es) + len(gcompTemp.es) * 100 / len(tempG.es) ) giantComponent = sortedComponents[0] giantComponentGraph = tempG.vs.select(giantComponent[0]).subgraph() @@ -135,7 +135,7 @@ def calculateCoCitation(self, filepath): with open(os.path.join(self.outpath, infilename + '.ncol'), 'w') as outfile: for edge in sortCoCitCounts: outfile.write(f"{edge[0]} {edge[1]} {edge[2]}\n") - except: + except Exception: raise if self.debug == "l2": print(f'\tDone in {time.time() - starttime} seconds.') @@ -153,7 +153,7 @@ def processFolder(self): for file in tqdm(os.listdir(self.inpath), leave=False): try: year = re.findall(r'\d{4}', file)[0] - except: + except Exception: raise if self.timerange[0] <= int(year) <= self.timerange[1]: try: @@ -163,7 +163,7 @@ def processFolder(self): gcmetafile.write( f'{year},{outtuple[0]},{outtuple[1]},{outtuple[2]},{outtuple[3]}\n' ) - except: + except Exception: raise if self.debug is True: print(f'\tDone in {time.time() - starttime} seconds.') diff --git a/src/semanticlayertools/linkage/wordscore.py 
b/src/semanticlayertools/linkage/wordscore.py index e36721b..36552c7 100644 --- a/src/semanticlayertools/linkage/wordscore.py +++ b/src/semanticlayertools/linkage/wordscore.py @@ -93,7 +93,7 @@ def getScore(self, target): lvalue = len(set(x for x in contains if x[1] == subgram)) valueList.append((lvalue + 1) * (rvalue + 1)) return { - target: 1/self.counts[target] * (np.prod(valueList)) ** (1 / (2.0 * len(target))) + target: 1 / self.counts[target] * (np.prod(valueList)) ** (1 / (2.0 * len(target))) } def _calcBatch(self, batch): @@ -109,13 +109,13 @@ def run(self, write=False, outpath='./', recreate=False, limitCPUs=True): if self.debug is True: print(f'Found {len(self.uniqueNGrams)} unique {self.ngramEnd}-grams.') if limitCPUs is True: - ncores = int(cpu_count()*1/4) + ncores = int(cpu_count() * 1 / 4) else: ncores = cpu_count() - 2 pool = Pool(ncores) - chunk_size = int(len(self.uniqueNGrams)/ncores) + chunk_size = int(len(self.uniqueNGrams) / ncores) batches = [ - list(self.uniqueNGrams)[i:i+chunk_size] for i in range(0, len(self.uniqueNGrams), chunk_size) + list(self.uniqueNGrams)[i:i + chunk_size] for i in range(0, len(self.uniqueNGrams), chunk_size) ] ncoresResults = pool.map(self._calcBatch, batches) results = [x for y in ncoresResults for x in y] @@ -133,7 +133,7 @@ def run(self, write=False, outpath='./', recreate=False, limitCPUs=True): if recreate is False: raise IOError( f'File at {filePath} exists. Set recreate = True to rewrite file.' - ) + ) if recreate is True: os.remove(filePath) with open(filePath, 'a') as yearfile: @@ -241,7 +241,7 @@ def writeLinks(self, sl, scorePath, scoreLimit, outpath='./', recreate=False): if recreate is False: raise IOError( f'File at {filePath} exists. Set recreate = True to rewrite file.' - ) + ) if recreate is True: os.remove(filePath) @@ -274,21 +274,23 @@ def writeLinks(self, sl, scorePath, scoreLimit, outpath='./', recreate=False): if len(authors) >= 2: # pairs = [x for x in combinations(authors, 2)] for pair in combinations(authors, 2): # pairs: - file.write('{0} {1} {2} {3} 1\n'.format( - 1, - self.nodeMap[pair[0]], - 1, - self.nodeMap[pair[1]] + file.write( + '{0} {1} {2} {3} 1\n'.format( + 1, + self.nodeMap[pair[0]], + 1, + self.nodeMap[pair[1]] ) ) for author in authors: try: authNr = self.nodeMap[author] - file.write('{0} {1} {2} {3} 1\n'.format( - 1, - authNr, - 2, - paperNr + file.write( + '{0} {1} {2} {3} 1\n'.format( + 1, + authNr, + 2, + paperNr ) ) except KeyError: @@ -297,12 +299,13 @@ def writeLinks(self, sl, scorePath, scoreLimit, outpath='./', recreate=False): try: ngramNr = self.nodeMap[ngramrow[1]] weight = ngramrow[2] - file.write('{0} {1} {2} {3} {4}\n'.format( - 2, - paperNr, - 3, - ngramNr, - weight + file.write( + '{0} {1} {2} {3} {4}\n'.format( + 2, + paperNr, + 3, + ngramNr, + weight ) ) except KeyError: diff --git a/src/semanticlayertools/pipelines/cocitetimeclusters.py b/src/semanticlayertools/pipelines/cocitetimeclusters.py index 1762507..ea64e64 100644 --- a/src/semanticlayertools/pipelines/cocitetimeclusters.py +++ b/src/semanticlayertools/pipelines/cocitetimeclusters.py @@ -1,4 +1,4 @@ -"""Runs all steps to create reports for cocite temporal network clustering.""" + import time import os import multiprocessing @@ -23,6 +23,41 @@ def run( numberproc: int = num_processes, limitRefLength=False, debug=False ): + """Runs all steps of the temporal clustering pipepline. + + Creates cocitation networks, finds temporal clusters, writes report files + for large clusters. + + Default time range is 1945 to 2005. 
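The word-scoring `run()` method shown in the hunk above parallelizes the scoring by chunking the unique ngrams over a multiprocessing pool; the following standalone sketch restates that batching pattern with a dummy scoring function (`score_batch` is a stand-in, not part of the package).

~~~python
# Illustrative sketch (not from the patch series): the batching pattern used by the
# word-scoring run() method above. score_batch is a dummy stand-in for _calcBatch.
from multiprocessing import Pool, cpu_count

def score_batch(batch):
    return [(ngram, len(ngram)) for ngram in batch]

if __name__ == "__main__":
    unique_ngrams = [("semantic",), ("semantic", "layer"), ("layer", "tools")]
    ncores = max(1, cpu_count() // 4)   # the patched run() uses a quarter of the cores if limitCPUs is True
    chunk_size = max(1, len(unique_ngrams) // ncores)
    batches = [unique_ngrams[i:i + chunk_size]
               for i in range(0, len(unique_ngrams), chunk_size)]
    with Pool(ncores) as pool:
        per_batch = pool.map(score_batch, batches)
    scores = [item for batch in per_batch for item in batch]
    print(scores)
~~~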
Minimal size for considered clusters is + 1000 nodes. Lists of references are assumed to be contained in column + "reference". + + By default this routine takes all available cpu cores. Limit this + to a lower value to allow parallel performance of other tasks. + + :param inputFilepath: Path to corpora input data + :type text: str + :param cociteOutpath: Output path for cocitation networks + :type text: str + :param timeclusterOutpath: Output path for time clusters + :type text: str + :param reportsOutpath: Output path for reports + :type text: str + :param resolution: Main parameter for the clustering quality function (Constant Pots Model) + :type resolution: float + :param intersliceCoupling: Coupling parameter between two year slices, also influences cluster detection + :type intersliceCoupling: float + :param minClusterSize: The minimal cluster size, above which clusters are considered (default=1000) + :type minClusterSize: int + :param timerange: Time range to evalute clusters for (usefull for limiting computation time, default = (1945, 2005)) + :type timerange: tuple + :param referenceColumnName: Column name containing the references of a publication + :type referenceColumnName: str + :param numberProc: Number of CPUs the package is allowed to use (default=all) + :type numberProc: int + :param limitRefLength: Either False or integer giving the maximum number of references a considered publication is allowed to contain + :type limitRefLength: bool or int + """ for path in [cociteOutpath, timeclusterOutpath, reportsOutpath]: os.makedirs(path) starttime = time.time() diff --git a/src/semanticlayertools/pipelines/wordscorenet.py b/src/semanticlayertools/pipelines/wordscorenet.py index 5996fda..4dd5669 100644 --- a/src/semanticlayertools/pipelines/wordscorenet.py +++ b/src/semanticlayertools/pipelines/wordscorenet.py @@ -1,4 +1,3 @@ -"""Runs all steps to create a multilayer network.""" import tempfile from datetime import datetime import os @@ -19,7 +18,34 @@ def run( ngramsize=5, scoreLimit=1.0 ): - """Run all steps for multilayer network generation using wordscoring.""" + """Run all steps for multilayer network generation using wordscoring. + + Calculates word scoring for corpus documents, creates multilayer network + by linking co-authors, their publications and used ngrams and + calculates clusters for each timeslice using the infomap algorithm. + + By default, temmporal folders are used such that only the found clusters + are returned. + + For details of the ngram method refere to the module documentation. + + :param dataframe: The input corpus dataframe. + :type dataframe: class:`pandas.DataFrame` + :param tempFiles: Use temporal files during the pipeline run. 
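A hedged usage sketch for this pipeline entry point; keyword names are taken from the docstring above, and the corpus path and output folders are placeholders.

~~~python
# Illustrative sketch (not from the patch series): driving the cocitation,
# time-clustering and reporting pipeline. Keyword names follow the docstring
# above; the corpus path and output folders are placeholders.
from semanticlayertools.pipelines.cocitetimeclusters import run

run(
    inputFilepath="./corpus/",            # metadata files with a "reference" column
    cociteOutpath="./cocite/",
    timeclusterOutpath="./timeclusters/",
    reportsOutpath="./reports/",
    minClusterSize=1000,
    timerange=(1960, 1990),
    limitRefLength=False,                 # or an int capping the references per publication
)
~~~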
+ :type tempFiles: bool + :param outpath: Path for writing resulting cluster data + :type outpath: str + :param textColumn: Column name to use for ngram calculation + :type textColumn: str + :param pubIDColumn: Column name to use for publication identification (assumend to be unique) + :type pubIDColumn: str + :param yearColumn: Column name for temporal ordering publications, used during writing the scoring files + :type yearColumn: str + :param ngramsize: Maximum of considered ngrams (default: 5-gram) + :type ngramsize: int + :param scoreLimit: Minimal weight in the full corpus to consider an ngram score (default: 1.0) + :type scoreLimit: float + """ if tempFiles is True: basedir = tempfile.TemporaryDirectory().name diff --git a/tests/linkage/test_wordscore.py b/tests/linkage/test_wordscore.py index f40a60e..4eb8ce3 100644 --- a/tests/linkage/test_wordscore.py +++ b/tests/linkage/test_wordscore.py @@ -8,6 +8,7 @@ df = pd.read_json(filePath) + class TestCalculateScores(unittest.TestCase): def setUp(self): diff --git a/tox.ini b/tox.ini index 5047f72..1621a60 100644 --- a/tox.ini +++ b/tox.ini @@ -15,6 +15,12 @@ deps = commands_pre = python -m spacy download en_core_web_sm commands = pytest {posargs} +[testenv:flake8] +deps = + flake8 +commands = + flake8 --ignore=E501,E402,F401 src/semanticlayertools/ tests/ + [testenv:docs] description = invoke sphinx-build to build the HTML docs basepython = python3.9 From caee4324c89c1ced2bfaab021efb5b0cd88a5e7b Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 28 Feb 2022 14:40:01 +0100 Subject: [PATCH 32/53] add req corpus for readthedocs --- docs/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/requirements.txt b/docs/requirements.txt index c9c7e88..ff2bf99 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,3 +2,4 @@ sphinx sphinx_rtd_theme m2r2 semanticlayertools +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl From 3786649a87c163feb583412731ff911fced5dc3b Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 28 Feb 2022 14:46:15 +0100 Subject: [PATCH 33/53] bump version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index a18cfc9..dacad42 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = semanticlayertools -version = 0.0.4 +version = 0.0.5 author = Malte Vogl author_email = mvogl@mpiwg-berlin.mpg.de description = Create semantic layers using different methods for word linking. From 1702f8f6aa44453b3c12f91748c3ac6977c5f80b Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 28 Feb 2022 15:12:11 +0100 Subject: [PATCH 34/53] add docs req egg install --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index ff2bf99..1aeab91 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,5 @@ +-e . 
sphinx sphinx_rtd_theme m2r2 -semanticlayertools en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl From 92605e5799c92c2fa28ccf3fc7ca60a275240dde Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 2 Mar 2022 11:45:32 +0100 Subject: [PATCH 35/53] wip fix small data size vs cpu count --- src/semanticlayertools/linkage/cocitation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/semanticlayertools/linkage/cocitation.py b/src/semanticlayertools/linkage/cocitation.py index e22bed2..4e69587 100644 --- a/src/semanticlayertools/linkage/cocitation.py +++ b/src/semanticlayertools/linkage/cocitation.py @@ -100,6 +100,8 @@ def calculateCoCitation(self, filepath): try: data = pd.read_json(filepath, lines=True).dropna(subset=[self.columnName]) chunk_size = int(data.shape[0] / self.numberProc) + if chunk_size == 0: # Deal with small data samples. + chunk_size = 1 chunks = np.array_split(data, chunk_size) pool = multiprocessing.Pool(processes=self.numberProc) cocitations = pool.map(self.getCombinations, chunks) From 016d02e634fa6685cefa5fc45ee8fdfc525d41e9 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 2 Mar 2022 11:53:27 +0100 Subject: [PATCH 36/53] upd version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index dacad42..281cf44 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = semanticlayertools -version = 0.0.5 +version = 0.1.1 author = Malte Vogl author_email = mvogl@mpiwg-berlin.mpg.de description = Create semantic layers using different methods for word linking. From 2c1eea51d6d745d712f07b097508fb91fe92b964 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 2 Mar 2022 14:53:04 +0100 Subject: [PATCH 37/53] wip fix nodeid -> self.publicationIDcolumn, check for module import error --- src/semanticlayertools/clustering/reports.py | 2 +- src/semanticlayertools/visual/utils.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index d6ba223..fe871fd 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -190,7 +190,7 @@ def fullReport(self, cluster): dfCluster = pd.concat(clusterdf, ignore_index=True) basedf = self.clusternodes.query('cluster == @cluster') inputnodes = set(basedf.node.values) - notFound = inputnodes.difference(set(dfCluster.nodeID.values)) + notFound = inputnodes.difference(set(dfCluster[self.publicationIDcolumn].values)) topAuthors = Counter( [x for y in dfCluster[self.authorColumnName].fillna('').values for x in y] ).most_common(20) diff --git a/src/semanticlayertools/visual/utils.py b/src/semanticlayertools/visual/utils.py index 85febac..6985c55 100644 --- a/src/semanticlayertools/visual/utils.py +++ b/src/semanticlayertools/visual/utils.py @@ -6,10 +6,14 @@ import numpy as np from scipy import stats -from sentence_transformers import SentenceTransformer -import umap -import hdbscan -import torch +try: + from sentence_transformers import SentenceTransformer + import umap + import hdbscan + import torch +except ModuleNotFoundError as e: + print('Please install the dependencies for the visualization routines, using `pip install semanticlayertools[embeddml]`.') + raise e smoothing = TypeVar('smoothing', bool, float) From a874d14725295f85562e146f3d666c0cde35465c Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 2 Mar 2022 15:08:30 
+0100 Subject: [PATCH 38/53] fix: text column contains text in list form --- src/semanticlayertools/clustering/reports.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index fe871fd..531cd35 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -97,7 +97,7 @@ def create_corpus(self, dataframe): nlp = spacy.load(mainLanguageCorp) docs = [] - titles = [x[0] for x in dataframe[self.textcolumn].values if type(x) == list] + titles = dataframe[self.textcolumn].values for title in tqdm(titles, leave=False): try: # text pre-processing From fb9b79ebe9c6455eb96bad111bfb63b60a7e5687 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Wed, 2 Mar 2022 15:20:07 +0100 Subject: [PATCH 39/53] fix: author and aff are joined by semicolon --- src/semanticlayertools/clustering/reports.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index 531cd35..7fe6752 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -192,17 +192,19 @@ def fullReport(self, cluster): inputnodes = set(basedf.node.values) notFound = inputnodes.difference(set(dfCluster[self.publicationIDcolumn].values)) topAuthors = Counter( - [x for y in dfCluster[self.authorColumnName].fillna('').values for x in y] - ).most_common(20) + [x for y in [x.split(';') for x in dfCluster[self.authorColumnName].fillna('').values] for x in y] + ).most_common(21) authortext = '' for x in topAuthors: - authortext += f'\t{x[0]}: {x[1]}\n' + if x[0] != '': + authortext += f'\t{x[0]}: {x[1]}\n' topAffils = Counter( - [x for y in dfCluster[self.affiliationColumnName].fillna('').values for x in y] + [x for y in [x.split(';') for x in dfCluster[self.affiliationColumnName].fillna('').values] for x in y] ).most_common(21) affiltext = '' - for x in topAffils[1:]: - affiltext += f'\t{x[0]}: {x[1]}\n' + for x in topAffils: + if x[0] != '': + affiltext += f'\t{x[0]}: {x[1]}\n' print(f'\tFinished base report for cluster {cluster}.') corpus = self.create_corpus(dfCluster) warnings.simplefilter(action='ignore', category=FutureWarning) @@ -281,5 +283,5 @@ def writeReports(self): """Generate reports and write to output path.""" for cluster in tqdm(self.largeClusterList, leave=False): outtext = self.fullReport(cluster) - with open(f'{self.outpath}Cluster_{cluster}.txt', 'w') as file: + with open(f'{self.outpath}/Cluster_{cluster}.txt', 'w') as file: file.write(outtext) From 32d095b592fa7555fd0dcbdba96d41a01f212ffa Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 3 Mar 2022 14:40:51 +0100 Subject: [PATCH 40/53] add option to cluster full graphs --- docs/visual.rst | 1 + src/semanticlayertools/clustering/leiden.py | 14 +++++++++++--- src/semanticlayertools/clustering/reports.py | 3 ++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/visual.rst b/docs/visual.rst index 3a1b6f3..0513ae3 100644 --- a/docs/visual.rst +++ b/docs/visual.rst @@ -36,6 +36,7 @@ the SentenceTransformer approach of SBERT and UMAP. Time consuming method! .. 
seealso :: `SBERT docs `_ + `UMAP docs `_ diff --git a/src/semanticlayertools/clustering/leiden.py b/src/semanticlayertools/clustering/leiden.py index bd39d9e..7b70ec8 100644 --- a/src/semanticlayertools/clustering/leiden.py +++ b/src/semanticlayertools/clustering/leiden.py @@ -44,8 +44,10 @@ class TimeCluster(): def __init__( self, inpath: str, outpath: str, - resolution: float = 0.003, intersliceCoupling: float = 0.4, + resolution: float = 0.003, + intersliceCoupling: float = 0.4, timerange: tuple = (1945, 2005), + useGC: bool = True, ): starttime = time.time() self.inpath = inpath @@ -61,7 +63,10 @@ def __init__( if os.path.isfile(self.outfile): raise OSError(f'Output file at {self.outfile} exists. Aborting.') - edgefiles = [x for x in os.listdir(inpath) if x.endswith('_GC.net')] + if useGC is True: + edgefiles = [x for x in os.listdir(inpath) if x.endswith('_GC.net')] + elif useGC is False: + edgefiles = [x for x in os.listdir(inpath) if x.endswith('.ncol')] self.graphDict = {} @@ -71,7 +76,10 @@ def __init__( except Exception: raise if timerange[0] <= int(year) <= timerange[1]: - graph = ig.Graph.Read_Pajek(os.path.join(inpath, edgefiles[idx])) + if useGC is True: + graph = ig.Graph.Read_Pajek(os.path.join(inpath, edgefiles[idx])) + elif useGC is False: + graph = ig.Graph.Read_Ncol(os.path.join(inpath, edgefiles[idx])) self.graphDict[year] = graph self.optimiser = la.Optimiser() diff --git a/src/semanticlayertools/clustering/reports.py b/src/semanticlayertools/clustering/reports.py index 7fe6752..30be990 100644 --- a/src/semanticlayertools/clustering/reports.py +++ b/src/semanticlayertools/clustering/reports.py @@ -52,7 +52,8 @@ def __init__( authorColumnName: str = 'author', affiliationColumnName: str = 'aff', publicationIDcolumn: str = 'nodeID', - numberProc: int = num_processes, minClusterSize: int = 1000, + numberProc: int = num_processes, + minClusterSize: int = 1000, timerange: tuple = (1945, 2005) ): """Constructor method""" From 788f07f072c27678e2bab2223623ec35bb5eced5 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 3 Mar 2022 18:06:08 +0100 Subject: [PATCH 41/53] include generateTree add authors, tox dep include embeddml --- AUTHORS.rst | 14 + docs/authors.rst | 1 + docs/index.rst | 1 + docs/requirements.txt | 2 +- docs/visual.rst | 16 +- setup.cfg | 3 +- .../visual/generateCitationTree.py | 705 ++++++++++++++++++ tox.ini | 3 +- 8 files changed, 741 insertions(+), 4 deletions(-) create mode 100644 AUTHORS.rst create mode 100644 docs/authors.rst create mode 100644 src/semanticlayertools/visual/generateCitationTree.py diff --git a/AUTHORS.rst b/AUTHORS.rst new file mode 100644 index 0000000..184b5ec --- /dev/null +++ b/AUTHORS.rst @@ -0,0 +1,14 @@ +======= +Credits +======= + +Development Lead +---------------- + +* Malte Vogl + +Contributors +------------ + +* Ira Kokoshko +* Robert Egel diff --git a/docs/authors.rst b/docs/authors.rst new file mode 100644 index 0000000..e122f91 --- /dev/null +++ b/docs/authors.rst @@ -0,0 +1 @@ +.. include:: ../AUTHORS.rst diff --git a/docs/index.rst b/docs/index.rst index 64da186..22a566c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -26,6 +26,7 @@ funded by the Federal Ministry of Education and Research, Germany (Grant No. 01 linkage clustering visual + authors license diff --git a/docs/requirements.txt b/docs/requirements.txt index 1aeab91..e5f708f 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ --e . 
+-e .[embeddml] sphinx sphinx_rtd_theme m2r2 diff --git a/docs/visual.rst b/docs/visual.rst index 0513ae3..09e7e89 100644 --- a/docs/visual.rst +++ b/docs/visual.rst @@ -36,7 +36,7 @@ the SentenceTransformer approach of SBERT and UMAP. Time consuming method! .. seealso :: `SBERT docs `_ - + `UMAP docs `_ @@ -57,3 +57,17 @@ HDBSCAN clustering. Reuses previously generated embedding of corpus. .. seealso :: `HDBSCAN docs `_ + + +Generate citation and reference tree graph +****************************************** + +Using the Dimensions AI dataset, this routine generates a structure +starting from a source publications, that represents its references and their +references as well as its citations and their citations. With this means, +visualizations of it show academic roots and conduits and can display +disciplinary pathways. + +.. automodule:: semanticlayertools.visual.generateCitationTree + :members: + :undoc-members: diff --git a/setup.cfg b/setup.cfg index 281cf44..93a474f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = semanticlayertools -version = 0.1.1 +version = 0.1.3 author = Malte Vogl author_email = mvogl@mpiwg-berlin.mpg.de description = Create semantic layers using different methods for word linking. @@ -42,6 +42,7 @@ dev = tox sphinx embeddml = + dimcli torch umap-learn hdbscan diff --git a/src/semanticlayertools/visual/generateCitationTree.py b/src/semanticlayertools/visual/generateCitationTree.py new file mode 100644 index 0000000..68ef71a --- /dev/null +++ b/src/semanticlayertools/visual/generateCitationTree.py @@ -0,0 +1,705 @@ +""" +generateCitationTree contains classes for generating citation trees for a single DOI. + +https://app.dimensions.ai is used in favor of CrossRef, as it contains richer information. +Requires dimensions.ai API access. +Results are compatible to the existing GenerateCitationNetwork module. +""" +import os +import dimcli +import pandas as pd +import numpy as np +import json +from tqdm import tqdm +import math +from typing import Dict, Tuple, List +import datetime +import time +import re +from requests.exceptions import HTTPError + +# type aliases +Doi = str +PubID = str +FilePath = str + + +class generate: + """GenerateCitationNet makes citation/reference networks for a document. + + For a given input document, its references and citations are evaluated. In + a second step, citations of citations and references of references are extracted. + This information is used to generate a tree like network. + """ + + def __init__( + self, + verbose: bool = False, + api_key="", + use_expanded_target_references: bool = False, + ): + """ + __init__ instantiates citation network generator. 
+ + :param verbose: forwarded to dimcli queries, defaults to False + :type verbose: bool, optional + :param api_key: dimensions.ai API key, tries to use dsl.ini if not existent, defaults to "" + :type api_key: str, optional + :param use_expanded_target_references: whether or not to use indirect connections + (not through input node) to make network edges + :type use_expanded_target_references: bool, optional + """ + while not dimcli.login_status(): + try: + dimcli.login(key=api_key) + except HTTPError as e: + if e.response.status_code == 401: + raise + time.sleep(5) + pass + + self.dsl: dimcli.Dsl = dimcli.Dsl() + self._verbose: bool = verbose + self.startDoi: Doi = "" + self.stringClean = {r"\s": "__", "/": "_slash_", ":": "_colon_", r"\.": "_dot_"} + self._make_hairball = use_expanded_target_references + + def fetchPubsByIDs( + self, pubIDs: List[PubID], authors: bool = True + ) -> Tuple[bool, pd.DataFrame]: + """ + Fetch publications from dimcli using PubIDs defined by dimensions.ai. + + :param pubIDs: list of PubIDs (string type alias) + :type pubIDs: List[PubID] + :param authors: whether to fetch author information, defaults to True + :type authors: bool, optional + :return: status-bool (True if everything is okay), dataframe containing + required information for input publications + :rtype: Tuple[bool, pd.DataFrame] + """ + if self._verbose: + print(f"hi, this is fetchPubsByIDs() for pubs = {pubIDs}") + + if authors: + query = f""" + search publications + where id in {json.dumps(pubIDs)} + return publications[id+doi+title+category_for+year + +reference_ids+authors+journal_title_raw+times_cited] + limit {len(pubIDs)} + """ + else: + query = f""" + search publications + where id in {json.dumps(pubIDs)} + return publications[id+doi+title+category_for+year + +reference_ids+journal_title_raw+times_cited] + limit {len(pubIDs)} + """ + + dsl_data = self.dsl.query(query, verbose=self._verbose) + + df = dsl_data.as_dataframe() + + try: + df["target_refs"] = df["reference_ids"] + except (TypeError, KeyError): + return False, pd.DataFrame() + + # replace NaN with empty list + df["target_refs"] = df["target_refs"].apply( + lambda target_ref: [] if type(target_ref) == float else target_ref + ) + + if not authors: + df["authors"] = [np.nan] * len(df) + + return True, df + + def fetchPubsByDois( + self, dois: List[Doi], authors: bool = True + ) -> Tuple[bool, pd.DataFrame]: + """ + Fetch publications from dimcli using DOIs. 
+ + :param dois: list of DOIs (string type alias) + :type dois: List[Doi] + :param authors: whether to fetch author information, defaults to True + :type authors: bool, optional + :return: status-bool (True if everything is okay), dataframe containing + required information for input publications + :rtype: Tuple[bool, pd.DataFrame] + """ + if self._verbose: + print(f"hi, this is fetchOrigin() for doi = {dois}") + + if authors: + query = f""" + search publications + where doi in {json.dumps(dois)} + return publications[id+doi+title+category_for+year + +reference_ids+authors+journal_title_raw+times_cited] + """ + else: + query = f""" + search publications + where doi in {json.dumps(dois)} + return publications[id+doi+title+category_for+year + +reference_ids+journal_title_raw+times_cited] + """ + + dsl_data = self.dsl.query(query, verbose=self._verbose) + + df = dsl_data.as_dataframe() + try: + df["target_refs"] = df["reference_ids"] + except (TypeError, KeyError): + return False, pd.DataFrame() + + # replace NaN with empty list + df["target_refs"] = df["target_refs"].apply( + lambda target_ref: [] if type(target_ref) == float else target_ref + ) + + if not authors: + df["authors"] = [np.nan] * len(df) + + return True, df + + def fetchCitations( + self, pubIDs: List[PubID], authors: bool = False + ) -> Tuple[bool, pd.DataFrame]: + """ + Fetch citing publications for a list of publications using their PubIDs. + + :param pubIDs: list of PubIDs (string type alias) + :type pubIDs: List[PubID] + :param authors: whether to fetch author information, defaults to False + :type authors: bool, optional + :return: status-bool (True if everything is okay), dataframe containing + required information for citing publications + :rtype: Tuple[bool, pd.DataFrame] + """ + if self._verbose: + print(f"hi, this is fetchCitations() for pubs = {pubIDs}") + + dfs = [] + if math.ceil(len(pubIDs) / 512) > 1: + __range__ = tqdm(range(math.ceil(len(pubIDs) / 512))) + else: + __range__ = range(math.ceil(len(pubIDs) / 512)) + + for i in __range__: + # dimcli queries are limited to 512 entites per list for `in` filtering + offset = i * 512 + + if authors: + query = f""" + search publications + where reference_ids in {json.dumps(pubIDs[offset:offset+512])} + return publications[id+doi+title+category_for+year + +reference_ids+authors+journal_title_raw+times_cited] + """ + else: + query = f""" + search publications + where reference_ids in {json.dumps(pubIDs[offset:offset+512])} + return publications[id+doi+title+category_for+year + +reference_ids+journal_title_raw+times_cited] + """ + + dsl_data = self.dsl.query_iterative(query, verbose=self._verbose) + tmp = dsl_data.as_dataframe() + + try: + _ = tmp["reference_ids"] + except (TypeError, KeyError): + return False, pd.DataFrame() + + dfs.append(tmp) + + df = pd.concat(dfs) + # intersection of input pubIDs and references of each publication + df["target_refs"] = df["reference_ids"].apply( + lambda row_refs: list(set(pubIDs) & set(row_refs)) + ) + + if not authors: + df["authors"] = [np.nan] * len(df) + + return True, df + + def fetchReferences( + self, pubIDs: List[PubID], authors: bool = True + ) -> Tuple[bool, pd.DataFrame]: + """ + Fetch references for a list of publications using their PubIDs as defined by dimensions.ai. 
+ + :param pubIDs: list of PubIDs (string type alias) + :type pubIDs: List[PubID] + :param authors: whether to fetch author information, defaults to True + :type authors: bool, optional + :return: status-bool (True if everything is okay), dataframe containing + required information for references + :rtype: Tuple[bool, pd.DataFrame] + """ + if self._verbose: + print(f"hi, this is fetchReferences() for pubs = {pubIDs}") + + dfs = [] + + if math.ceil(len(pubIDs) / 512) > 1: + __range__ = tqdm(range(math.ceil(len(pubIDs) / 512))) + else: + __range__ = range(math.ceil(len(pubIDs) / 512)) + + # get references (PubID) of given PubIDs + for i in __range__: + # dimcli queries are limited to 512 entites per list for `in` filtering + offset = i * 512 + + query = f""" + search publications + where id in {json.dumps(pubIDs[offset:offset + 512])} + return publications[id+reference_ids] + limit 512 + """ + + dsl_data = self.dsl.query(query, verbose=self._verbose) + tmp = dsl_data.as_dataframe() + + try: + _ = tmp["reference_ids"] + except (TypeError, KeyError): + return False, pd.DataFrame() + + dfs.append(tmp) + + df0 = pd.concat(dfs) + + # flatten list of references + # List[List[PubID]] -> List[PubID] + refs = [x for x in df0["reference_ids"].dropna().to_list() for x in x] + + # drop duplicates + refs = list(set(refs)) + + dfs = [] + if math.ceil(len(refs) / 512) > 1: + __range__ = tqdm(range(math.ceil(len(refs) / 512))) + else: + __range__ = range(math.ceil(len(refs) / 512)) + + for i in __range__: + # dimcli queries are limited to 512 entites per list for `in` filtering + offset = i * 512 + ok, df = self.fetchPubsByIDs(refs[offset : offset + 512], authors=authors) + if ok: + dfs.append(df) + else: # pragma: no cover + # cannot be reached unless dimensions database is malicious + return False, pd.DataFrame() + + return True, pd.concat(dfs) + + def run( + self, doi: Doi, levels_ref: int = 2, levels_cite: int = 2, authors: bool = False + ) -> Tuple[bool, pd.DataFrame]: + """ + Generate citation network for a publication using its DOI. 
+ + :param doi: input DOI (string type alias) + :type doi: Doi + :param levels_ref: number of levels for references, defaults to 2 + :type levels_ref: int, optional + :param levels_cite: number of levels for citing publications, defaults to 2 + :type levels_cite: int, optional + :param authors: whether to include author information, defaults to False + :type authors: bool, optional + :return: status-bool (True if everything is okay), dataframe containing + required information for input publications, references and citing publications + :rtype: Tuple[bool, pd.DataFrame] + """ + if hasattr(self, "result_df"): + return True, self.result_df + + self.startDoi = doi + + dfs = [] + + print("level 0") + ok, df_origin = self.fetchPubsByDois([doi], authors) + if not ok: + print(f"could not fetch publication for DOI {doi}") + return False, pd.DataFrame() + + df_origin["level"] = 0 + dfs.append(df_origin) + + ok, dfs_cite = self._fetchCite(df_origin, levels_cite, authors) + ok, dfs_ref = self._fetchRef(df_origin, levels_ref, authors) + + dfs.extend(dfs_cite + dfs_ref) + + self.result_df: pd.DataFrame = pd.concat(dfs).reset_index(drop=True) + + # cleaning + # self.result_df = self.dropDuplicates(self.result_df) + self.result_df["first_author"] = self.result_df["authors"].apply( + lambda authors: authors[0]["last_name"] if type(authors) == list else "" + ) + self.result_df["ref_count"] = self.result_df["reference_ids"].apply( + lambda refs: len(refs) if type(refs) == list else None + ) + self.result_df.index = self.result_df["id"] + + self.main_node = df_origin.iloc[0].copy() + if type(self.main_node["authors"]) == list: + self.main_node["first_author"] = self.main_node["authors"][0]["last_name"] + else: + self.main_node["first_author"] = "" + + self.result_df["main_category_for"] = self.result_df["category_for"].apply( + lambda c: [ + x["name"] + for x in filter(lambda dict_: re.match(r"^\d\d\s", dict_["name"]), c) + ][0] + if type(c) == list + else "" + ) + + # replace NaN in reference_ids with empty list + self.result_df["reference_ids"] = self.result_df["reference_ids"].apply( + lambda target_ref: [] if type(target_ref) == float else target_ref + ) + + # include expanded target refs + # (intersection of references of a paper and all listed publications, + # e.g. 
main_node cites A, Y cites main_node and A + # -> target_refs does not contain connection from Y to A) + all_pubs = set(self.result_df.index) + self.result_df["expanded_target_refs"] = self.result_df["reference_ids"].apply( + lambda reference_ids: list(all_pubs.intersection(reference_ids)) + ) + + return True, self.result_df + + def _fetchCite( + self, df_origin: pd.DataFrame, levels: int, authors: bool + ) -> Tuple[bool, List[pd.DataFrame]]: + dfs_cite = [] + + pubIDs = df_origin["id"].to_list() + for i in range(levels): + print(f"level {i + 1}, fetching citations for {len(pubIDs)} publications") + ok, tmp = self.fetchCitations(pubIDs, authors) + if ok: + pubIDs = tmp["id"].to_list() + tmp["level"] = i + 1 + dfs_cite.append(tmp) + else: # pragma: no cover + # cannot be reached unless dimensions database is malicious + return False, pd.DataFrame() + return True, dfs_cite + + def _fetchRef( + self, df_origin: pd.DataFrame, levels: int, authors: bool + ) -> Tuple[bool, List[pd.DataFrame]]: + dfs_ref = [] + pubIDs = df_origin["id"].to_list() + for i in range(levels): + print( + f"level {(i + 1) * (-1)}, fetching references for {len(pubIDs)} publications" + ) + ok, tmp = self.fetchReferences(pubIDs, authors) + if ok: + pubIDs = tmp["id"].to_list() + tmp["level"] = (i + 1) * (-1) + dfs_ref.append(tmp) + else: # pragma: no cover + # cannot be reached unless dimensions database is malicious + return False, pd.DataFrame() + return True, dfs_ref + + def _makeCompatibleRefDf( + self, df: pd.DataFrame, use_expanded: bool = False + ) -> pd.DataFrame: + """ + Reformat references dataframe to match prior versions formatting. + + :param df: dataframe as generated by .fetchReferences() + :type df: pd.DataFrame + :return: compatible dataframe + :rtype: pd.DataFrame + """ + levels_ref = min(df["level"]) + + # flatten references + if use_expanded: + target_ref_type = "expanded_target_refs" + else: + target_ref_type = "target_refs" + + ref_tuples = { + (row["id"], ref) + for _, row in df.query(f"{levels_ref} < level <= 0").iterrows() + for ref in row[target_ref_type] + } + + df = df[~df.index.duplicated(keep="first")] + + refs = [] + for (source_id, target_id) in ref_tuples: + source = df.loc[source_id] + target = df.loc[target_id] + + refs.append( + { + "type": "reference", + "sourceYear": source["year"], + "sourceDOI": source["doi"], + "sourcePubID": source["id"], + "sourceJournal": source["journal_title_raw"], + "targetFull": "", + "targetYear": target["year"], + "targetDOI": target["doi"], + "targetPubID": target["id"], + "targetrefCount": target["ref_count"], + "targetis_ref_byCount": target["times_cited"], + "targettitleStr": target["title"], + "targetFirstAuthor": target["first_author"], + "targetJournal": target["journal_title_raw"], + "targetSubject": target["category_for"], + } + ) + + return pd.DataFrame(refs) + + def _makeCompatibleCiteDf( + self, df: pd.DataFrame, use_expanded: bool = False + ) -> pd.DataFrame: + """ + Reformat citation dataframe to match prior versions formatting. 
+ + :param df: dataframe as generated by .fetchCitations() + :type df: pd.DataFrame + :return: compatible dataframe + :rtype: pd.DataFrame + """ + levels_cite = max(df["level"]) + + # flatten citations + if use_expanded: + target_ref_type = "expanded_target_refs" + else: + target_ref_type = "target_refs" + + cite_tuples = { + (row["id"], ref) + for _, row in df.query(f"{levels_cite} >= level > 0").iterrows() + for ref in row[target_ref_type] + } + + df = df[~df.index.duplicated(keep="first")] + + cites = [] + for (source_id, target_id) in cite_tuples: + source = df.loc[source_id] + target = df.loc[target_id] + + cites.append( + { + "type": "citation", + "targetPubID": target["id"], + "targetYear": target["year"], + "targetDOI": target["doi"], + "targetJournal": target["journal_title_raw"], + "sourceYear": source["year"], + "sourceDOI": source["doi"], + "sourcePubID": source["id"], + "sourcerefCount": source["ref_count"], + "sourceis_ref_byCount": source["times_cited"], + "sourcetitleStr": source["title"], + "sourceFirstAuthor": source["first_author"], + "sourceJournal": source["journal_title_raw"], + "sourceSubject": source["category_for"], + } + ) + + return pd.DataFrame(cites) + + def makeCompatibleDf(self) -> Tuple[bool, pd.DataFrame]: + """ + Reformat dataframe to match prior versions formatting. + + :param df: dataframe as generated by .run() + :type df: pd.DataFrame + :return: compatible dataframe + :rtype: pd.DataFrame + """ + if not hasattr(self, "result_df"): + print("you gotta run .run() first") + return False, pd.DataFrame() + + if hasattr(self, "compatible_result_df"): + return True, self.compatible_result_df + + df_ref = self._makeCompatibleRefDf(self.result_df, self._make_hairball) + df_cite = self._makeCompatibleCiteDf(self.result_df, self._make_hairball) + + self.compatible_result_df: pd.DataFrame = pd.concat( + [df_cite, df_ref], ignore_index=True + ).fillna("") + + return True, self.compatible_result_df + + def runCompatible( + self, + doi: Doi, + level: int = 2, + direct: str = "both", + debug: bool = False, + ) -> Tuple[bool, pd.DataFrame]: + """ + Wrap .run() with same parameters and outputs as prior versions. 
+ + :param doi: input DOI (string type alias) + :type doi: Doi + :param level: number of levels to fetch, defaults to 2 + :type level: int, optional + :param direct: direction of search (either "ref", "cite" or "both"), defaults to "both" + :type direct: str, optional + :param debug: [description], defaults to False + :type debug: bool, optional + :return: [description] + :rtype: Tuple[bool, pd.DataFrame] + """ + if direct == "ref": + ok, df = self.run(doi, levels_ref=level, levels_cite=0) + elif direct == "cite": + ok, df = self.run(doi, levels_ref=0, levels_cite=level) + elif direct == "both": + ok, df = self.run(doi, levels_ref=level, levels_cite=level) + else: + print("provide proper direction of search (either `ref`, `cite` or `both`)") + return False, pd.DataFrame() + + if not ok: + return False, pd.DataFrame() + + ok, comp_df = self.makeCompatibleDf() + if ok: + return True, comp_df + else: + return False, pd.DataFrame() + + def _nodeDict(self, row: pd.Series) -> Dict: + # row = row.fillna("") + + if row["doi"].lower() == self.startDoi.lower(): + inputDOI = "True" + else: + inputDOI = "False" + res = { + # "label": nodeName, + # "x": 0, + # "y": 0, + "id": row["id"], + "attributes": { + # "name": nodeName, + "title": row["title"], + "doi": row["doi"], + "nodeyear": row["year"], + "ref-by-count": row["times_cited"], + "is_input_DOI": inputDOI, + "category_for": row["main_category_for"], + "level": row["level"], + }, + # "color": "rgb(0,0,0)", + # "size": 10 + } + return res + + def _edgeDict(self, row: pd.Series) -> Dict: + # row = row.fillna("") + + res = { + "source": row["sourcePubID"], + "target": row["targetPubID"], + # "id": idx, + "attributes": {"year": row["sourceYear"], "type": row["type"]}, + # "color": "rgb(0,0,0)", + # "size": 1 + } + return res + + def _createFilename(self, ext: str = "json") -> FilePath: + filename = self.startDoi + date = datetime.datetime.now().strftime("%Y-%m-%d") + for key, val in self.stringClean.items(): + filename = re.sub(key, val, filename) + if self._make_hairball: + path = f"{self.main_node['first_author']}_{filename}_date_{date}_hairball.{ext}" + else: + path = f"{self.main_node['first_author']}_{filename}_date_{date}.{ext}" + return path + + def createJSON(self, outputPath: FilePath = "./out") -> Tuple[bool, FilePath]: + """ + Create JSON file on disk containing network as lists of nodes and edges for visualization. 
+ + :param outputPath: output directory, defaults to "./out" + :type outputPath: FilePath, optional + :return: status-bool (True if everything is okay), path of JSON file + :rtype: Tuple[bool, FilePath] + """ + if not hasattr(self, "result_df"): + print("You need to use .run() first to create some data to write.") + return False, "" + + if not hasattr(self, "compatible_result_df"): + self.makeCompatibleDf() + + allNodes = [ + x + for _, x in self.result_df[~self.result_df.index.duplicated()] + .fillna("") + .iterrows() + ] + allRows = [x for x in self.compatible_result_df.fillna("").iterrows()] + + outputPath = os.path.abspath(outputPath) + if not os.path.exists(outputPath): + os.mkdir(outputPath) + + with open(f"{outputPath}/{self._createFilename()}", "w") as outFile: + # write nodes + outFile.write('{\n "nodes": [\n') + + # write nodes from compatible_result_df/allNodes + while allNodes: + node = allNodes.pop() + if len(allNodes) == 0: + outFile.write(json.dumps(self._nodeDict(node)) + "\n") + else: + outFile.write(json.dumps(self._nodeDict(node)) + ",\n") + + # write edges + outFile.write(' ],\n "edges":[') + while allRows: + idx, edge = allRows.pop() + if len(allRows) == 0: + outFile.write(json.dumps(self._edgeDict(edge)) + "\n") + else: + x = self._edgeDict(edge) + outFile.write(json.dumps(x) + ",\n") + outFile.write(" ]\n}") + + return True, f"{outputPath}/{self._createFilename()}" + + def logout(self) -> None: + """ + Dimcli logout. + """ + dimcli.logout() diff --git a/tox.ini b/tox.ini index 1621a60..e7173e7 100644 --- a/tox.ini +++ b/tox.ini @@ -11,7 +11,7 @@ testpaths = [testenv] deps = pytest - -rrequirements.txt + -e ./[embeddml] commands_pre = python -m spacy download en_core_web_sm commands = pytest {posargs} @@ -25,6 +25,7 @@ commands = description = invoke sphinx-build to build the HTML docs basepython = python3.9 deps = + -e ./[embeddml] sphinx sphinx_rtd_theme m2r2 From 0d4371c1ec2f51436e5cd763ccab4d4d6f95958e Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 7 Mar 2022 16:05:08 +0100 Subject: [PATCH 42/53] wip updt org --- src/semanticlayertools/visual/generateCitationTree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/semanticlayertools/visual/generateCitationTree.py b/src/semanticlayertools/visual/generateCitationTree.py index 68ef71a..69c7b04 100644 --- a/src/semanticlayertools/visual/generateCitationTree.py +++ b/src/semanticlayertools/visual/generateCitationTree.py @@ -24,7 +24,7 @@ FilePath = str -class generate: +class generateTree: """GenerateCitationNet makes citation/reference networks for a document. For a given input document, its references and citations are evaluated. In @@ -310,7 +310,7 @@ def run( Generate citation network for a publication using its DOI. 
:param doi: input DOI (string type alias) - :type doi: Doi + :type doi: Drunoi :param levels_ref: number of levels for references, defaults to 2 :type levels_ref: int, optional :param levels_cite: number of levels for citing publications, defaults to 2 From 69ea04d226765c10f15bcb9dca4d18048105673e Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 15 Mar 2022 15:13:34 +0100 Subject: [PATCH 43/53] add citationet working data generation --- src/semanticlayertools/visual/citationnet.py | 235 +++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 src/semanticlayertools/visual/citationnet.py diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py new file mode 100644 index 0000000..6201657 --- /dev/null +++ b/src/semanticlayertools/visual/citationnet.py @@ -0,0 +1,235 @@ +import dimcli +import json +import numpy as np +import pandas as pd +from tqdm import tqdm +import time +from collections import Counter +from requests.exceptions import HTTPError + + +class GenerateTree: + """Generate tree for citationent visualization. + + For a given input document, its references and citations are evaluated. In + a second step, citations of citations and references of references are + extracted. This information is used to generate a tree like network for + visualization. + """ + + def __init__(self, verbose: bool = False, api_key=""): + """Init module.""" + while not dimcli.login_status(): + try: + dimcli.login(key=api_key) + except HTTPError as e: + if e.response.status_code == 401: + raise + time.sleep(5) + pass + + self.dsl: dimcli.Dsl = dimcli.Dsl() + self._verbose = verbose + self.startDoi: str = "" + self.citationLimit: int = 100 + self.dataframeList = [] + + self.stringClean = { + r"\s": "__", + "/": "_slash_", + ":": "_colon_", + r"\.": "_dot_" + } + + def _formatFOR(self, row): + """Format existing FOR codes. + + Each publication has a total value of one. Only first level parts of + codes are counted. If no FOR code exist, return '00:1'. + + Example: "02, 0201, 0204, 06" yields "02:0.75;06:025" + """ + try: + inputForcodes = [x['name'][:2] for x in row] + forcodes = ';'.join( + [f'{x[0]}:{x[1]/len(inputForcodes):.2f}' for x in Counter( + inputForcodes + ).most_common()] + ) + except TypeError: + forcodes = '00:1' + return forcodes + + def _editDF(self, inputdf, dftype='cite_l1', level2List=None): + """Return reformated dataframe. 
""" + retCols = ['source', 'target', 'doi', 'year', 'title', 'times_cited', 'forcodes', 'level', 'is_input'] + formatedFOR = inputdf.category_for.apply(lambda row: self._formatFOR(row)) + inputdf.insert(0, 'forcodes', formatedFOR) + inputdf.drop(['category_for'], axis=1, inplace=True) + inputdf.rename(columns={'id': 'source'}, inplace=True) + if dftype in ['ref_l1', 'cite_l2', 'ref_l2']: + outdf = inputdf.explode('reference_ids') + outdf.rename(columns={'reference_ids': 'target'}, inplace=True) + if dftype == 'cite_l2': + outdf = outdf.query('target.isin(@level2List)') + elif dftype == 'cite_l1': + inputdf.insert(0, 'target', self.pubids) + outdf = inputdf.copy() + outdf.insert(0, 'level', dftype) + outdf = outdf.dropna(subset=['source', 'target']) + outdf.insert( + 0, + 'is_input', + outdf.source.apply(lambda x: x == self.pubids) + ) + return outdf[retCols] + + def _getMissing(self, idlist): + """Get metadata for second level reference nodes.""" + retCols = ['source', 'doi', 'year', 'title', 'times_cited', 'forcodes', 'level', 'is_input'] + dfList = [] + if len(idlist) > 512: + for partlist in tqdm(np.array_split(idlist, round(len(idlist)/400))): + res = self.dsl.query_iterative( + f"""search + publications + where + id in {json.dumps(list(partlist))} + return + publications[id+doi+times_cited+category_for+title+year] + """, + verbose=self._verbose + ) + dfList.append(res.as_dataframe()) + retDF = pd.concat(dfList) + else: + res = self.dsl.query_iterative( + f"""search + publications + where + id in {json.dumps(list(idlist))} + return + publications[id+doi+times_cited+category_for+title+year] + """, + verbose=self._verbose + ) + retDF = res.as_dataframe() + formatedFOR = retDF.category_for.apply(lambda row: self._formatFOR(row)) + retDF.insert(0, 'forcodes', formatedFOR) + retDF.drop(['category_for'], axis=1, inplace=True) + retDF.rename(columns={'id': 'source'}, inplace=True) + retDF.insert(0, 'level', 'ref_l2') + retDF.insert(0, 'is_input', False) + return retDF[retCols] + + def query(self, startDoi=''): + self.startDoi = startDoi + starttime = time.time() + doi2id = self.dsl.query( + f"""search + publications + where + doi = "{startDoi}" and times_cited <= {self.citationLimit} + return + publications[id+doi+times_cited+category_for+title+year+reference_ids] + """, + verbose=self._verbose + ) + querydf = doi2id.as_dataframe() + if querydf.shape[0] > 0: + self.pubids = querydf['id'].values[0] + self.pubrefs = list( + [x for y in querydf['reference_ids'].values for x in y] + ) + self.dataframeList.append( + self._editDF(querydf, dftype="ref_l1") + ) + ref1trgtList = list(self.dataframeList[0].target.values) + cit1df = self.dsl.query_iterative( + f"""search + publications + where + reference_ids = "{self.pubids}" + return + publications[id+doi+times_cited+category_for+title+year+reference_ids] + """, + verbose=self._verbose) + self.dataframeList.append( + self._editDF(cit1df.as_dataframe(), dftype='cite_l1') + ) + cit1SrcList = list(self.dataframeList[1].source.values) + cit2df = self.dsl.query_iterative( + f"""search + publications + where + reference_ids in {json.dumps(cit1SrcList)} + return + publications[id+doi+times_cited+category_for+title+year+reference_ids]""", + verbose=self._verbose + ) + self.dataframeList.append( + self._editDF(cit2df.as_dataframe(), dftype='cite_l2', level2List=cit1SrcList) + ) + ref2df = self.dsl.query_iterative( + f"""search + publications + where + id in {json.dumps(ref1trgtList)} + return + 
publications[id+doi+times_cited+category_for+title+year+reference_ids]""", + verbose=self._verbose + ) + self.dataframeList.append( + self._editDF(ref2df.as_dataframe(), dftype='ref_l2') + ) + print(f'Finished queries in {time.time() - starttime} seconds.') + return self + else: + print('The requested DOI is cited to often.') + + def returnLinks(self): + return pd.concat(self.dataframeList) + + def generateNetworkFiles(self, outpath): + starttime = time.time() + outformat = {'nodes': [], 'edges': []} + dflinks = pd.concat(self.dataframeList) + srcNodes = dflinks.source.unique() + trgNodes = [x for x in dflinks.target.unique() if x not in srcNodes] + nodeMetadata = pd.concat( + [ + dflinks.drop('target', axis=1).drop_duplicates(), + self._getMissing(trgNodes) + ] + ) + for idx, row in nodeMetadata.iterrows(): + outformat['nodes'].append( + { + 'id': row['source'], + 'attributes': + { + "title": row["title"], + "doi": row["doi"], + "nodeyear": row["year"], + "ref-by-count": row["times_cited"], + "is_input_DOI": row['is_input'], + "category_for": row["forcodes"], + 'level': row['level'] + } + } + ) + for idx, row in dflinks.iterrows(): + outformat['edges'].append( + { + 'source': row['source'], + 'target': row['target'], + 'attributes': + { + 'year': row['year'], + 'level': row['level'] + } + } + ) + with open(outpath, 'w') as outfile: + json.dump(outformat, outfile, indent=4) + return f'Finished querying extra metadata in {time.time() - starttime} seconds.' From eaa2d0fdef4e7bc68b93d844ba9097ba8fbd6218 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Thu, 17 Mar 2022 18:00:43 +0100 Subject: [PATCH 44/53] wip add cleaning routine for title strings to make data json friendly --- src/semanticlayertools/visual/citationnet.py | 31 +++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index 6201657..602fd93 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -4,6 +4,8 @@ import pandas as pd from tqdm import tqdm import time +import re +import os from collections import Counter from requests.exceptions import HTTPError @@ -41,6 +43,19 @@ def __init__(self, verbose: bool = False, api_key=""): r"\.": "_dot_" } + def _cleanTitleString(self, row): + """Clean non-JSON characters from titles. + + Removes newline characters and double backslashes. + """ + try: + title = row + for pair in [('\n', ' '), (r':?\\+', '')]: + title = re.sub(pair[0], pair[1], title) + return title + except Exception: + return 'Can not process title.' + def _formatFOR(self, row): """Format existing FOR codes. 
@@ -82,6 +97,9 @@ def _editDF(self, inputdf, dftype='cite_l1', level2List=None): 'is_input', outdf.source.apply(lambda x: x == self.pubids) ) + cleantitle = outdf.title.apply(lambda row: self._cleanTitleString(row)) + outdf.drop('title', axis=1, inplace=True) + outdf.insert(0, 'title', cleantitle) return outdf[retCols] def _getMissing(self, idlist): @@ -190,7 +208,7 @@ def query(self, startDoi=''): def returnLinks(self): return pd.concat(self.dataframeList) - def generateNetworkFiles(self, outpath): + def generateNetworkFiles(self, outfolder): starttime = time.time() outformat = {'nodes': [], 'edges': []} dflinks = pd.concat(self.dataframeList) @@ -208,7 +226,7 @@ def generateNetworkFiles(self, outpath): 'id': row['source'], 'attributes': { - "title": row["title"], + "title": row['title'], "doi": row["doi"], "nodeyear": row["year"], "ref-by-count": row["times_cited"], @@ -230,6 +248,11 @@ def generateNetworkFiles(self, outpath): } } ) - with open(outpath, 'w') as outfile: - json.dump(outformat, outfile, indent=4) + doiname = self.startDoi + for key, val in self.stringClean.items(): + doiname = re.sub(key, val, doiname) + + outfile = os.path.join(outfolder, doiname + '.json') + with open(outfile, 'w', encoding="utf8") as ofile: + json.dump(outformat, ofile, ensure_ascii=False) return f'Finished querying extra metadata in {time.time() - starttime} seconds.' From b16514816d6b3ee21c41b2b1237b448012bd4115 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Fri, 18 Mar 2022 15:07:51 +0100 Subject: [PATCH 45/53] add cleaning of " --- src/semanticlayertools/visual/citationnet.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index 602fd93..f72745d 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -50,7 +50,7 @@ def _cleanTitleString(self, row): """ try: title = row - for pair in [('\n', ' '), (r':?\\+', '')]: + for pair in [('\n', ' '), (r':?\\+', ''), ('"', '')]: title = re.sub(pair[0], pair[1], title) return title except Exception: @@ -136,8 +136,11 @@ def _getMissing(self, idlist): retDF.insert(0, 'forcodes', formatedFOR) retDF.drop(['category_for'], axis=1, inplace=True) retDF.rename(columns={'id': 'source'}, inplace=True) - retDF.insert(0, 'level', 'ref_l2') + retDF.insert(0, 'level', 'ref_l3') retDF.insert(0, 'is_input', False) + cleantitle = retDF.title.apply(lambda row: self._cleanTitleString(row)) + retDF.drop('title', axis=1, inplace=True) + retDF.insert(0, 'title', cleantitle) return retDF[retCols] def query(self, startDoi=''): @@ -220,7 +223,7 @@ def generateNetworkFiles(self, outfolder): self._getMissing(trgNodes) ] ) - for idx, row in nodeMetadata.iterrows(): + for idx, row in nodeMetadata.fillna('').iterrows(): outformat['nodes'].append( { 'id': row['source'], @@ -236,7 +239,7 @@ def generateNetworkFiles(self, outfolder): } } ) - for idx, row in dflinks.iterrows(): + for idx, row in dflinks.fillna('').iterrows(): outformat['edges'].append( { 'source': row['source'], @@ -254,5 +257,5 @@ def generateNetworkFiles(self, outfolder): outfile = os.path.join(outfolder, doiname + '.json') with open(outfile, 'w', encoding="utf8") as ofile: - json.dump(outformat, ofile, ensure_ascii=False) + json.dump(outformat, ofile, indent=4, ensure_ascii=True) return f'Finished querying extra metadata in {time.time() - starttime} seconds.' 
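Taken together, patches 43 to 45 make the citationnet data generation usable end to end. A minimal sketch, assuming a valid dimensions.ai API key and using a placeholder DOI and output folder that would need to be replaced::

    import os

    from semanticlayertools.visual.citationnet import GenerateTree

    outfolder = "./citationnet_data"   # hypothetical output directory
    os.makedirs(outfolder, exist_ok=True)

    # The constructor logs in to dimensions.ai via dimcli, so a working key is required.
    tree = GenerateTree(api_key="YOUR_DIMENSIONS_API_KEY")

    # Collect references and citations around one publication (placeholder DOI).
    tree.query(startDoi="10.1234/example.doi")

    # Inspect the collected source/target links as a pandas DataFrame ...
    print(tree.returnLinks().head())

    # ... or write the node/edge JSON consumed by the citationnet visualization.
    print(tree.generateNetworkFiles(outfolder))

If the starting DOI is cited more often than the built-in citation limit, query() only reports this and collects no data, so the two calls after it are only meaningful after a successful query.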
From 0c0419ffca29835ea690b0c3f4cdb4b9bd4ee959 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 21 Mar 2022 13:37:07 +0100 Subject: [PATCH 46/53] add first author name to filename --- src/semanticlayertools/visual/citationnet.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index f72745d..69f3a9c 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -33,6 +33,7 @@ def __init__(self, verbose: bool = False, api_key=""): self.dsl: dimcli.Dsl = dimcli.Dsl() self._verbose = verbose self.startDoi: str = "" + self.firstAuthor: str = "NoAuthor" self.citationLimit: int = 100 self.dataframeList = [] @@ -145,6 +146,7 @@ def _getMissing(self, idlist): def query(self, startDoi=''): self.startDoi = startDoi + self.dataframeList = [] starttime = time.time() doi2id = self.dsl.query( f"""search @@ -152,11 +154,15 @@ def query(self, startDoi=''): where doi = "{startDoi}" and times_cited <= {self.citationLimit} return - publications[id+doi+times_cited+category_for+title+year+reference_ids] + publications[id+authors+doi+times_cited+category_for+title+year+reference_ids] """, verbose=self._verbose ) querydf = doi2id.as_dataframe() + try: + self.firstAuthor = doi2id.as_dataframe_authors().last_name.iloc[0] + except Exception: + raise if querydf.shape[0] > 0: self.pubids = querydf['id'].values[0] self.pubrefs = list( @@ -252,10 +258,12 @@ def generateNetworkFiles(self, outfolder): } ) doiname = self.startDoi + firstauthor = self.firstAuthor for key, val in self.stringClean.items(): doiname = re.sub(key, val, doiname) + firstauthor = re.sub(key, val, firstauthor) - outfile = os.path.join(outfolder, doiname + '.json') + outfile = os.path.join(outfolder, firstauthor + '_' + doiname + '.json') with open(outfile, 'w', encoding="utf8") as ofile: - json.dump(outformat, ofile, indent=4, ensure_ascii=True) + json.dump(outformat, ofile, ensure_ascii=True) return f'Finished querying extra metadata in {time.time() - starttime} seconds.' From 1df9645daf939089f3b010fcce3e5f79389a76cc Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Mon, 21 Mar 2022 16:09:40 +0100 Subject: [PATCH 47/53] wip add time,filename output --- src/semanticlayertools/visual/citationnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index 69f3a9c..73253a0 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -263,7 +263,7 @@ def generateNetworkFiles(self, outfolder): doiname = re.sub(key, val, doiname) firstauthor = re.sub(key, val, firstauthor) - outfile = os.path.join(outfolder, firstauthor + '_' + doiname + '.json') + outfile = os.path.join(outfolder, f'{firstauthor}_{doiname}.json') with open(outfile, 'w', encoding="utf8") as ofile: json.dump(outformat, ofile, ensure_ascii=True) - return f'Finished querying extra metadata in {time.time() - starttime} seconds.' 
+ return {time.time() - starttime}, f'{firstauthor}_{doiname}.json' From 8ddb6ac17fdea2030a3a208cecda5c1bb58b6107 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 22 Mar 2022 12:34:17 +0100 Subject: [PATCH 48/53] catch exception of no author or empty df --- src/semanticlayertools/visual/citationnet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index 73253a0..c5ff17c 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -160,9 +160,9 @@ def query(self, startDoi=''): ) querydf = doi2id.as_dataframe() try: - self.firstAuthor = doi2id.as_dataframe_authors().last_name.iloc[0] - except Exception: - raise + self.firstAuthor = doi2id.as_dataframe_authors()['last_name'].iloc[0] + except KeyError: + pass if querydf.shape[0] > 0: self.pubids = querydf['id'].values[0] self.pubrefs = list( @@ -212,7 +212,7 @@ def query(self, startDoi=''): print(f'Finished queries in {time.time() - starttime} seconds.') return self else: - print('The requested DOI is cited to often.') + print('The requested DOI is either cited to often or not available in the dataset.') def returnLinks(self): return pd.concat(self.dataframeList) From ab082024f1c3cdb24eac311658525f48f2a7229c Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 22 Mar 2022 12:36:54 +0100 Subject: [PATCH 49/53] chg return type --- src/semanticlayertools/visual/citationnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index c5ff17c..4ab4a1c 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -212,7 +212,7 @@ def query(self, startDoi=''): print(f'Finished queries in {time.time() - starttime} seconds.') return self else: - print('The requested DOI is either cited to often or not available in the dataset.') + return f'The requested DOI {startDoi} is either cited to often or not available in the dataset.' def returnLinks(self): return pd.concat(self.dataframeList) From 15d96666bf23914cd0caa3688c68e77811b77229 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 22 Mar 2022 13:14:27 +0100 Subject: [PATCH 50/53] add option for citationlimit --- src/semanticlayertools/visual/citationnet.py | 3 +- .../visual/generateCitationTree.py | 705 ------------------ 2 files changed, 2 insertions(+), 706 deletions(-) delete mode 100644 src/semanticlayertools/visual/generateCitationTree.py diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index 4ab4a1c..edf0aaf 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -144,8 +144,9 @@ def _getMissing(self, idlist): retDF.insert(0, 'title', cleantitle) return retDF[retCols] - def query(self, startDoi=''): + def query(self, startDoi='', citationLimit=100): self.startDoi = startDoi + self.citationLimit = citationLimit self.dataframeList = [] starttime = time.time() doi2id = self.dsl.query( diff --git a/src/semanticlayertools/visual/generateCitationTree.py b/src/semanticlayertools/visual/generateCitationTree.py deleted file mode 100644 index 69c7b04..0000000 --- a/src/semanticlayertools/visual/generateCitationTree.py +++ /dev/null @@ -1,705 +0,0 @@ -""" -generateCitationTree contains classes for generating citation trees for a single DOI. 
- -https://app.dimensions.ai is used in favor of CrossRef, as it contains richer information. -Requires dimensions.ai API access. -Results are compatible to the existing GenerateCitationNetwork module. -""" -import os -import dimcli -import pandas as pd -import numpy as np -import json -from tqdm import tqdm -import math -from typing import Dict, Tuple, List -import datetime -import time -import re -from requests.exceptions import HTTPError - -# type aliases -Doi = str -PubID = str -FilePath = str - - -class generateTree: - """GenerateCitationNet makes citation/reference networks for a document. - - For a given input document, its references and citations are evaluated. In - a second step, citations of citations and references of references are extracted. - This information is used to generate a tree like network. - """ - - def __init__( - self, - verbose: bool = False, - api_key="", - use_expanded_target_references: bool = False, - ): - """ - __init__ instantiates citation network generator. - - :param verbose: forwarded to dimcli queries, defaults to False - :type verbose: bool, optional - :param api_key: dimensions.ai API key, tries to use dsl.ini if not existent, defaults to "" - :type api_key: str, optional - :param use_expanded_target_references: whether or not to use indirect connections - (not through input node) to make network edges - :type use_expanded_target_references: bool, optional - """ - while not dimcli.login_status(): - try: - dimcli.login(key=api_key) - except HTTPError as e: - if e.response.status_code == 401: - raise - time.sleep(5) - pass - - self.dsl: dimcli.Dsl = dimcli.Dsl() - self._verbose: bool = verbose - self.startDoi: Doi = "" - self.stringClean = {r"\s": "__", "/": "_slash_", ":": "_colon_", r"\.": "_dot_"} - self._make_hairball = use_expanded_target_references - - def fetchPubsByIDs( - self, pubIDs: List[PubID], authors: bool = True - ) -> Tuple[bool, pd.DataFrame]: - """ - Fetch publications from dimcli using PubIDs defined by dimensions.ai. - - :param pubIDs: list of PubIDs (string type alias) - :type pubIDs: List[PubID] - :param authors: whether to fetch author information, defaults to True - :type authors: bool, optional - :return: status-bool (True if everything is okay), dataframe containing - required information for input publications - :rtype: Tuple[bool, pd.DataFrame] - """ - if self._verbose: - print(f"hi, this is fetchPubsByIDs() for pubs = {pubIDs}") - - if authors: - query = f""" - search publications - where id in {json.dumps(pubIDs)} - return publications[id+doi+title+category_for+year - +reference_ids+authors+journal_title_raw+times_cited] - limit {len(pubIDs)} - """ - else: - query = f""" - search publications - where id in {json.dumps(pubIDs)} - return publications[id+doi+title+category_for+year - +reference_ids+journal_title_raw+times_cited] - limit {len(pubIDs)} - """ - - dsl_data = self.dsl.query(query, verbose=self._verbose) - - df = dsl_data.as_dataframe() - - try: - df["target_refs"] = df["reference_ids"] - except (TypeError, KeyError): - return False, pd.DataFrame() - - # replace NaN with empty list - df["target_refs"] = df["target_refs"].apply( - lambda target_ref: [] if type(target_ref) == float else target_ref - ) - - if not authors: - df["authors"] = [np.nan] * len(df) - - return True, df - - def fetchPubsByDois( - self, dois: List[Doi], authors: bool = True - ) -> Tuple[bool, pd.DataFrame]: - """ - Fetch publications from dimcli using DOIs. 
- - :param dois: list of DOIs (string type alias) - :type dois: List[Doi] - :param authors: whether to fetch author information, defaults to True - :type authors: bool, optional - :return: status-bool (True if everything is okay), dataframe containing - required information for input publications - :rtype: Tuple[bool, pd.DataFrame] - """ - if self._verbose: - print(f"hi, this is fetchOrigin() for doi = {dois}") - - if authors: - query = f""" - search publications - where doi in {json.dumps(dois)} - return publications[id+doi+title+category_for+year - +reference_ids+authors+journal_title_raw+times_cited] - """ - else: - query = f""" - search publications - where doi in {json.dumps(dois)} - return publications[id+doi+title+category_for+year - +reference_ids+journal_title_raw+times_cited] - """ - - dsl_data = self.dsl.query(query, verbose=self._verbose) - - df = dsl_data.as_dataframe() - try: - df["target_refs"] = df["reference_ids"] - except (TypeError, KeyError): - return False, pd.DataFrame() - - # replace NaN with empty list - df["target_refs"] = df["target_refs"].apply( - lambda target_ref: [] if type(target_ref) == float else target_ref - ) - - if not authors: - df["authors"] = [np.nan] * len(df) - - return True, df - - def fetchCitations( - self, pubIDs: List[PubID], authors: bool = False - ) -> Tuple[bool, pd.DataFrame]: - """ - Fetch citing publications for a list of publications using their PubIDs. - - :param pubIDs: list of PubIDs (string type alias) - :type pubIDs: List[PubID] - :param authors: whether to fetch author information, defaults to False - :type authors: bool, optional - :return: status-bool (True if everything is okay), dataframe containing - required information for citing publications - :rtype: Tuple[bool, pd.DataFrame] - """ - if self._verbose: - print(f"hi, this is fetchCitations() for pubs = {pubIDs}") - - dfs = [] - if math.ceil(len(pubIDs) / 512) > 1: - __range__ = tqdm(range(math.ceil(len(pubIDs) / 512))) - else: - __range__ = range(math.ceil(len(pubIDs) / 512)) - - for i in __range__: - # dimcli queries are limited to 512 entites per list for `in` filtering - offset = i * 512 - - if authors: - query = f""" - search publications - where reference_ids in {json.dumps(pubIDs[offset:offset+512])} - return publications[id+doi+title+category_for+year - +reference_ids+authors+journal_title_raw+times_cited] - """ - else: - query = f""" - search publications - where reference_ids in {json.dumps(pubIDs[offset:offset+512])} - return publications[id+doi+title+category_for+year - +reference_ids+journal_title_raw+times_cited] - """ - - dsl_data = self.dsl.query_iterative(query, verbose=self._verbose) - tmp = dsl_data.as_dataframe() - - try: - _ = tmp["reference_ids"] - except (TypeError, KeyError): - return False, pd.DataFrame() - - dfs.append(tmp) - - df = pd.concat(dfs) - # intersection of input pubIDs and references of each publication - df["target_refs"] = df["reference_ids"].apply( - lambda row_refs: list(set(pubIDs) & set(row_refs)) - ) - - if not authors: - df["authors"] = [np.nan] * len(df) - - return True, df - - def fetchReferences( - self, pubIDs: List[PubID], authors: bool = True - ) -> Tuple[bool, pd.DataFrame]: - """ - Fetch references for a list of publications using their PubIDs as defined by dimensions.ai. 
- - :param pubIDs: list of PubIDs (string type alias) - :type pubIDs: List[PubID] - :param authors: whether to fetch author information, defaults to True - :type authors: bool, optional - :return: status-bool (True if everything is okay), dataframe containing - required information for references - :rtype: Tuple[bool, pd.DataFrame] - """ - if self._verbose: - print(f"hi, this is fetchReferences() for pubs = {pubIDs}") - - dfs = [] - - if math.ceil(len(pubIDs) / 512) > 1: - __range__ = tqdm(range(math.ceil(len(pubIDs) / 512))) - else: - __range__ = range(math.ceil(len(pubIDs) / 512)) - - # get references (PubID) of given PubIDs - for i in __range__: - # dimcli queries are limited to 512 entites per list for `in` filtering - offset = i * 512 - - query = f""" - search publications - where id in {json.dumps(pubIDs[offset:offset + 512])} - return publications[id+reference_ids] - limit 512 - """ - - dsl_data = self.dsl.query(query, verbose=self._verbose) - tmp = dsl_data.as_dataframe() - - try: - _ = tmp["reference_ids"] - except (TypeError, KeyError): - return False, pd.DataFrame() - - dfs.append(tmp) - - df0 = pd.concat(dfs) - - # flatten list of references - # List[List[PubID]] -> List[PubID] - refs = [x for x in df0["reference_ids"].dropna().to_list() for x in x] - - # drop duplicates - refs = list(set(refs)) - - dfs = [] - if math.ceil(len(refs) / 512) > 1: - __range__ = tqdm(range(math.ceil(len(refs) / 512))) - else: - __range__ = range(math.ceil(len(refs) / 512)) - - for i in __range__: - # dimcli queries are limited to 512 entites per list for `in` filtering - offset = i * 512 - ok, df = self.fetchPubsByIDs(refs[offset : offset + 512], authors=authors) - if ok: - dfs.append(df) - else: # pragma: no cover - # cannot be reached unless dimensions database is malicious - return False, pd.DataFrame() - - return True, pd.concat(dfs) - - def run( - self, doi: Doi, levels_ref: int = 2, levels_cite: int = 2, authors: bool = False - ) -> Tuple[bool, pd.DataFrame]: - """ - Generate citation network for a publication using its DOI. 
-
-        :param doi: input DOI (string type alias)
-        :type doi: Doi
-        :param levels_ref: number of levels for references, defaults to 2
-        :type levels_ref: int, optional
-        :param levels_cite: number of levels for citing publications, defaults to 2
-        :type levels_cite: int, optional
-        :param authors: whether to include author information, defaults to False
-        :type authors: bool, optional
-        :return: status-bool (True if everything is okay), dataframe containing
-            required information for input publications, references and citing publications
-        :rtype: Tuple[bool, pd.DataFrame]
-        """
-        if hasattr(self, "result_df"):
-            return True, self.result_df
-
-        self.startDoi = doi
-
-        dfs = []
-
-        print("level 0")
-        ok, df_origin = self.fetchPubsByDois([doi], authors)
-        if not ok:
-            print(f"could not fetch publication for DOI {doi}")
-            return False, pd.DataFrame()
-
-        df_origin["level"] = 0
-        dfs.append(df_origin)
-
-        ok, dfs_cite = self._fetchCite(df_origin, levels_cite, authors)
-        ok, dfs_ref = self._fetchRef(df_origin, levels_ref, authors)
-
-        dfs.extend(dfs_cite + dfs_ref)
-
-        self.result_df: pd.DataFrame = pd.concat(dfs).reset_index(drop=True)
-
-        # cleaning
-        # self.result_df = self.dropDuplicates(self.result_df)
-        self.result_df["first_author"] = self.result_df["authors"].apply(
-            lambda authors: authors[0]["last_name"] if type(authors) == list else ""
-        )
-        self.result_df["ref_count"] = self.result_df["reference_ids"].apply(
-            lambda refs: len(refs) if type(refs) == list else None
-        )
-        self.result_df.index = self.result_df["id"]
-
-        self.main_node = df_origin.iloc[0].copy()
-        if type(self.main_node["authors"]) == list:
-            self.main_node["first_author"] = self.main_node["authors"][0]["last_name"]
-        else:
-            self.main_node["first_author"] = ""
-
-        self.result_df["main_category_for"] = self.result_df["category_for"].apply(
-            lambda c: [
-                x["name"]
-                for x in filter(lambda dict_: re.match(r"^\d\d\s", dict_["name"]), c)
-            ][0]
-            if type(c) == list
-            else ""
-        )
-
-        # replace NaN in reference_ids with empty list
-        self.result_df["reference_ids"] = self.result_df["reference_ids"].apply(
-            lambda target_ref: [] if type(target_ref) == float else target_ref
-        )
-
-        # include expanded target refs
-        # (intersection of references of a paper and all listed publications,
-        # e.g.
main_node cites A, Y cites main_node and A - # -> target_refs does not contain connection from Y to A) - all_pubs = set(self.result_df.index) - self.result_df["expanded_target_refs"] = self.result_df["reference_ids"].apply( - lambda reference_ids: list(all_pubs.intersection(reference_ids)) - ) - - return True, self.result_df - - def _fetchCite( - self, df_origin: pd.DataFrame, levels: int, authors: bool - ) -> Tuple[bool, List[pd.DataFrame]]: - dfs_cite = [] - - pubIDs = df_origin["id"].to_list() - for i in range(levels): - print(f"level {i + 1}, fetching citations for {len(pubIDs)} publications") - ok, tmp = self.fetchCitations(pubIDs, authors) - if ok: - pubIDs = tmp["id"].to_list() - tmp["level"] = i + 1 - dfs_cite.append(tmp) - else: # pragma: no cover - # cannot be reached unless dimensions database is malicious - return False, pd.DataFrame() - return True, dfs_cite - - def _fetchRef( - self, df_origin: pd.DataFrame, levels: int, authors: bool - ) -> Tuple[bool, List[pd.DataFrame]]: - dfs_ref = [] - pubIDs = df_origin["id"].to_list() - for i in range(levels): - print( - f"level {(i + 1) * (-1)}, fetching references for {len(pubIDs)} publications" - ) - ok, tmp = self.fetchReferences(pubIDs, authors) - if ok: - pubIDs = tmp["id"].to_list() - tmp["level"] = (i + 1) * (-1) - dfs_ref.append(tmp) - else: # pragma: no cover - # cannot be reached unless dimensions database is malicious - return False, pd.DataFrame() - return True, dfs_ref - - def _makeCompatibleRefDf( - self, df: pd.DataFrame, use_expanded: bool = False - ) -> pd.DataFrame: - """ - Reformat references dataframe to match prior versions formatting. - - :param df: dataframe as generated by .fetchReferences() - :type df: pd.DataFrame - :return: compatible dataframe - :rtype: pd.DataFrame - """ - levels_ref = min(df["level"]) - - # flatten references - if use_expanded: - target_ref_type = "expanded_target_refs" - else: - target_ref_type = "target_refs" - - ref_tuples = { - (row["id"], ref) - for _, row in df.query(f"{levels_ref} < level <= 0").iterrows() - for ref in row[target_ref_type] - } - - df = df[~df.index.duplicated(keep="first")] - - refs = [] - for (source_id, target_id) in ref_tuples: - source = df.loc[source_id] - target = df.loc[target_id] - - refs.append( - { - "type": "reference", - "sourceYear": source["year"], - "sourceDOI": source["doi"], - "sourcePubID": source["id"], - "sourceJournal": source["journal_title_raw"], - "targetFull": "", - "targetYear": target["year"], - "targetDOI": target["doi"], - "targetPubID": target["id"], - "targetrefCount": target["ref_count"], - "targetis_ref_byCount": target["times_cited"], - "targettitleStr": target["title"], - "targetFirstAuthor": target["first_author"], - "targetJournal": target["journal_title_raw"], - "targetSubject": target["category_for"], - } - ) - - return pd.DataFrame(refs) - - def _makeCompatibleCiteDf( - self, df: pd.DataFrame, use_expanded: bool = False - ) -> pd.DataFrame: - """ - Reformat citation dataframe to match prior versions formatting. 
- - :param df: dataframe as generated by .fetchCitations() - :type df: pd.DataFrame - :return: compatible dataframe - :rtype: pd.DataFrame - """ - levels_cite = max(df["level"]) - - # flatten citations - if use_expanded: - target_ref_type = "expanded_target_refs" - else: - target_ref_type = "target_refs" - - cite_tuples = { - (row["id"], ref) - for _, row in df.query(f"{levels_cite} >= level > 0").iterrows() - for ref in row[target_ref_type] - } - - df = df[~df.index.duplicated(keep="first")] - - cites = [] - for (source_id, target_id) in cite_tuples: - source = df.loc[source_id] - target = df.loc[target_id] - - cites.append( - { - "type": "citation", - "targetPubID": target["id"], - "targetYear": target["year"], - "targetDOI": target["doi"], - "targetJournal": target["journal_title_raw"], - "sourceYear": source["year"], - "sourceDOI": source["doi"], - "sourcePubID": source["id"], - "sourcerefCount": source["ref_count"], - "sourceis_ref_byCount": source["times_cited"], - "sourcetitleStr": source["title"], - "sourceFirstAuthor": source["first_author"], - "sourceJournal": source["journal_title_raw"], - "sourceSubject": source["category_for"], - } - ) - - return pd.DataFrame(cites) - - def makeCompatibleDf(self) -> Tuple[bool, pd.DataFrame]: - """ - Reformat dataframe to match prior versions formatting. - - :param df: dataframe as generated by .run() - :type df: pd.DataFrame - :return: compatible dataframe - :rtype: pd.DataFrame - """ - if not hasattr(self, "result_df"): - print("you gotta run .run() first") - return False, pd.DataFrame() - - if hasattr(self, "compatible_result_df"): - return True, self.compatible_result_df - - df_ref = self._makeCompatibleRefDf(self.result_df, self._make_hairball) - df_cite = self._makeCompatibleCiteDf(self.result_df, self._make_hairball) - - self.compatible_result_df: pd.DataFrame = pd.concat( - [df_cite, df_ref], ignore_index=True - ).fillna("") - - return True, self.compatible_result_df - - def runCompatible( - self, - doi: Doi, - level: int = 2, - direct: str = "both", - debug: bool = False, - ) -> Tuple[bool, pd.DataFrame]: - """ - Wrap .run() with same parameters and outputs as prior versions. 
-
-        :param doi: input DOI (string type alias)
-        :type doi: Doi
-        :param level: number of levels to fetch, defaults to 2
-        :type level: int, optional
-        :param direct: direction of search (either "ref", "cite" or "both"), defaults to "both"
-        :type direct: str, optional
-        :param debug: currently unused, defaults to False
-        :type debug: bool, optional
-        :return: status-bool (True if everything is okay), compatible dataframe
-        :rtype: Tuple[bool, pd.DataFrame]
-        """
-        if direct == "ref":
-            ok, df = self.run(doi, levels_ref=level, levels_cite=0)
-        elif direct == "cite":
-            ok, df = self.run(doi, levels_ref=0, levels_cite=level)
-        elif direct == "both":
-            ok, df = self.run(doi, levels_ref=level, levels_cite=level)
-        else:
-            print("provide proper direction of search (either `ref`, `cite` or `both`)")
-            return False, pd.DataFrame()
-
-        if not ok:
-            return False, pd.DataFrame()
-
-        ok, comp_df = self.makeCompatibleDf()
-        if ok:
-            return True, comp_df
-        else:
-            return False, pd.DataFrame()
-
-    def _nodeDict(self, row: pd.Series) -> Dict:
-        # row = row.fillna("")
-
-        if row["doi"].lower() == self.startDoi.lower():
-            inputDOI = "True"
-        else:
-            inputDOI = "False"
-        res = {
-            # "label": nodeName,
-            # "x": 0,
-            # "y": 0,
-            "id": row["id"],
-            "attributes": {
-                # "name": nodeName,
-                "title": row["title"],
-                "doi": row["doi"],
-                "nodeyear": row["year"],
-                "ref-by-count": row["times_cited"],
-                "is_input_DOI": inputDOI,
-                "category_for": row["main_category_for"],
-                "level": row["level"],
-            },
-            # "color": "rgb(0,0,0)",
-            # "size": 10
-        }
-        return res
-
-    def _edgeDict(self, row: pd.Series) -> Dict:
-        # row = row.fillna("")
-
-        res = {
-            "source": row["sourcePubID"],
-            "target": row["targetPubID"],
-            # "id": idx,
-            "attributes": {"year": row["sourceYear"], "type": row["type"]},
-            # "color": "rgb(0,0,0)",
-            # "size": 1
-        }
-        return res
-
-    def _createFilename(self, ext: str = "json") -> FilePath:
-        filename = self.startDoi
-        date = datetime.datetime.now().strftime("%Y-%m-%d")
-        for key, val in self.stringClean.items():
-            filename = re.sub(key, val, filename)
-        if self._make_hairball:
-            path = f"{self.main_node['first_author']}_{filename}_date_{date}_hairball.{ext}"
-        else:
-            path = f"{self.main_node['first_author']}_{filename}_date_{date}.{ext}"
-        return path
-
-    def createJSON(self, outputPath: FilePath = "./out") -> Tuple[bool, FilePath]:
-        """
-        Create JSON file on disk containing network as lists of nodes and edges for visualization.
- - :param outputPath: output directory, defaults to "./out" - :type outputPath: FilePath, optional - :return: status-bool (True if everything is okay), path of JSON file - :rtype: Tuple[bool, FilePath] - """ - if not hasattr(self, "result_df"): - print("You need to use .run() first to create some data to write.") - return False, "" - - if not hasattr(self, "compatible_result_df"): - self.makeCompatibleDf() - - allNodes = [ - x - for _, x in self.result_df[~self.result_df.index.duplicated()] - .fillna("") - .iterrows() - ] - allRows = [x for x in self.compatible_result_df.fillna("").iterrows()] - - outputPath = os.path.abspath(outputPath) - if not os.path.exists(outputPath): - os.mkdir(outputPath) - - with open(f"{outputPath}/{self._createFilename()}", "w") as outFile: - # write nodes - outFile.write('{\n "nodes": [\n') - - # write nodes from compatible_result_df/allNodes - while allNodes: - node = allNodes.pop() - if len(allNodes) == 0: - outFile.write(json.dumps(self._nodeDict(node)) + "\n") - else: - outFile.write(json.dumps(self._nodeDict(node)) + ",\n") - - # write edges - outFile.write(' ],\n "edges":[') - while allRows: - idx, edge = allRows.pop() - if len(allRows) == 0: - outFile.write(json.dumps(self._edgeDict(edge)) + "\n") - else: - x = self._edgeDict(edge) - outFile.write(json.dumps(x) + ",\n") - outFile.write(" ]\n}") - - return True, f"{outputPath}/{self._createFilename()}" - - def logout(self) -> None: - """ - Dimcli logout. - """ - dimcli.logout() From 77c76a4a7e5f5e24c93f0002a370c4763f17b0cd Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 22 Mar 2022 13:36:59 +0100 Subject: [PATCH 51/53] return more informative feedback vals --- src/semanticlayertools/visual/citationnet.py | 103 ++++++++++--------- 1 file changed, 52 insertions(+), 51 deletions(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index edf0aaf..dda4cd4 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -153,67 +153,68 @@ def query(self, startDoi='', citationLimit=100): f"""search publications where - doi = "{startDoi}" and times_cited <= {self.citationLimit} + doi = "{startDoi}" return publications[id+authors+doi+times_cited+category_for+title+year+reference_ids] """, verbose=self._verbose ) querydf = doi2id.as_dataframe() + if querydf.shape[0] == 0: + return f"The dataset contains no entry for {startDoi}." + elif querydf['times_cited'].iloc[0] >= self.citationLimit: + return f"{startDoi} is cited {querydf['times_cited'].iloc[0]} times. You can try to change the limit, if possible." 
try: self.firstAuthor = doi2id.as_dataframe_authors()['last_name'].iloc[0] except KeyError: pass - if querydf.shape[0] > 0: - self.pubids = querydf['id'].values[0] - self.pubrefs = list( - [x for y in querydf['reference_ids'].values for x in y] - ) - self.dataframeList.append( - self._editDF(querydf, dftype="ref_l1") - ) - ref1trgtList = list(self.dataframeList[0].target.values) - cit1df = self.dsl.query_iterative( - f"""search - publications - where - reference_ids = "{self.pubids}" - return - publications[id+doi+times_cited+category_for+title+year+reference_ids] - """, - verbose=self._verbose) - self.dataframeList.append( - self._editDF(cit1df.as_dataframe(), dftype='cite_l1') - ) - cit1SrcList = list(self.dataframeList[1].source.values) - cit2df = self.dsl.query_iterative( - f"""search - publications - where - reference_ids in {json.dumps(cit1SrcList)} - return - publications[id+doi+times_cited+category_for+title+year+reference_ids]""", - verbose=self._verbose - ) - self.dataframeList.append( - self._editDF(cit2df.as_dataframe(), dftype='cite_l2', level2List=cit1SrcList) - ) - ref2df = self.dsl.query_iterative( - f"""search - publications - where - id in {json.dumps(ref1trgtList)} - return - publications[id+doi+times_cited+category_for+title+year+reference_ids]""", - verbose=self._verbose - ) - self.dataframeList.append( - self._editDF(ref2df.as_dataframe(), dftype='ref_l2') - ) - print(f'Finished queries in {time.time() - starttime} seconds.') - return self - else: - return f'The requested DOI {startDoi} is either cited to often or not available in the dataset.' + self.pubids = querydf['id'].values[0] + self.pubrefs = list( + [x for y in querydf['reference_ids'].values for x in y] + ) + self.dataframeList.append( + self._editDF(querydf, dftype="ref_l1") + ) + ref1trgtList = list(self.dataframeList[0].target.values) + cit1df = self.dsl.query_iterative( + f"""search + publications + where + reference_ids = "{self.pubids}" + return + publications[id+doi+times_cited+category_for+title+year+reference_ids] + """, + verbose=self._verbose) + self.dataframeList.append( + self._editDF(cit1df.as_dataframe(), dftype='cite_l1') + ) + cit1SrcList = list(self.dataframeList[1].source.values) + cit2df = self.dsl.query_iterative( + f"""search + publications + where + reference_ids in {json.dumps(cit1SrcList)} + return + publications[id+doi+times_cited+category_for+title+year+reference_ids]""", + verbose=self._verbose + ) + self.dataframeList.append( + self._editDF(cit2df.as_dataframe(), dftype='cite_l2', level2List=cit1SrcList) + ) + ref2df = self.dsl.query_iterative( + f"""search + publications + where + id in {json.dumps(ref1trgtList)} + return + publications[id+doi+times_cited+category_for+title+year+reference_ids]""", + verbose=self._verbose + ) + self.dataframeList.append( + self._editDF(ref2df.as_dataframe(), dftype='ref_l2') + ) + print(f'Finished queries in {time.time() - starttime} seconds.') + return self def returnLinks(self): return pd.concat(self.dataframeList) From f940b72e60feeedb183b682f9867c5af46879e25 Mon Sep 17 00:00:00 2001 From: Malte Vogl Date: Tue, 22 Mar 2022 14:48:21 +0100 Subject: [PATCH 52/53] chg output format of duration --- src/semanticlayertools/visual/citationnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py index dda4cd4..61134be 100644 --- a/src/semanticlayertools/visual/citationnet.py +++ b/src/semanticlayertools/visual/citationnet.py @@ -268,4 +268,4 @@ 
def generateNetworkFiles(self, outfolder):
         outfile = os.path.join(outfolder, f'{firstauthor}_{doiname}.json')
         with open(outfile, 'w', encoding="utf8") as ofile:
             json.dump(outformat, ofile, ensure_ascii=True)
-        return {time.time() - starttime}, f'{firstauthor}_{doiname}.json'
+        return time.time() - starttime, f'{firstauthor}_{doiname}.json'

From 8c63199baab4f94045024bc3c9e83f15f0a0eed3 Mon Sep 17 00:00:00 2001
From: Malte Vogl
Date: Thu, 24 Mar 2022 10:03:46 +0100
Subject: [PATCH 53/53] wip fix doc for visual

---
 docs/visual.rst                              | 3 ++-
 src/semanticlayertools/visual/citationnet.py | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/visual.rst b/docs/visual.rst
index 09e7e89..8fba857 100644
--- a/docs/visual.rst
+++ b/docs/visual.rst
@@ -68,6 +68,7 @@ references as well as its citations and their citations. With this
 means, visualizations of it show academic roots and conduits and can
 display disciplinary pathways.

-.. automodule:: semanticlayertools.visual.generateCitationTree
+.. automodule:: semanticlayertools.visual.citationnet
   :members:
+   :private-members:
   :undoc-members:
diff --git a/src/semanticlayertools/visual/citationnet.py b/src/semanticlayertools/visual/citationnet.py
index 61134be..d0c7926 100644
--- a/src/semanticlayertools/visual/citationnet.py
+++ b/src/semanticlayertools/visual/citationnet.py
@@ -47,7 +47,7 @@ def __init__(self, verbose: bool = False, api_key=""):
     def _cleanTitleString(self, row):
         """Clean non-JSON characters from titles.

-        Removes newline characters and double backslashes.
+        Removes newline characters, double backslashes, and quoted '"' characters.
         """
         try:
             title = row
@@ -217,9 +217,11 @@ def query(self, startDoi='', citationLimit=100):
         return self

     def returnLinks(self):
+        """Return all links as a single dataframe."""
         return pd.concat(self.dataframeList)

     def generateNetworkFiles(self, outfolder):
+        """Generate a JSON file with node and edge lists."""
         starttime = time.time()
         outformat = {'nodes': [], 'edges': []}
         dflinks = pd.concat(self.dataframeList)
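
The last three patches settle the public workflow of the citationnet module: query() now reports why a request was rejected (unknown DOI, or more citations than the chosen limit) and otherwise returns the instance, and generateNetworkFiles() returns the elapsed time together with the written filename. A minimal usage sketch of that workflow, assuming the class in semanticlayertools.visual.citationnet is importable as CitationNet (the class name is not visible in these hunks) and that dimensions.ai credentials are available via api_key or a local dsl.ini; the DOI, key, and output folder are placeholders:

    # Sketch only: class name, API key, DOI, and output folder are assumed placeholders.
    from semanticlayertools.visual.citationnet import CitationNet

    net = CitationNet(api_key="MY_DIMENSIONS_KEY")

    # query() returns an explanatory string if the DOI is unknown or cited more
    # often than citationLimit allows; on success it returns the instance itself.
    result = net.query(startDoi="10.1000/example.doi", citationLimit=100)
    if isinstance(result, str):
        print(result)
    else:
        links = net.returnLinks()  # all link dataframes concatenated into one
        # The output folder must already exist; returns (duration in seconds, filename).
        duration, filename = net.generateNetworkFiles("./out")
        print(f"Wrote {filename} in {duration:.1f} s with {len(links)} links")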