From 3b3b27336dcbfebf79420401201ee1eab435f92e Mon Sep 17 00:00:00 2001 From: LeighWeston86 Date: Thu, 3 Jan 2019 16:36:28 -0800 Subject: [PATCH 01/19] add similar_materials to rester --- matscholar/rest.py | 13 +++++-------- matscholar/tests/test_rest.py | 17 ++++++++++++++++- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/matscholar/rest.py b/matscholar/rest.py index 11dc1bf..cd820a1 100644 --- a/matscholar/rest.py +++ b/matscholar/rest.py @@ -213,6 +213,11 @@ def get_summary(self, query): return self._make_request(sub_url, payload=payload, method=method) + def get_similar_materials(self, material): + method = "GET" + sub_url = '/materials/similar/{}'.format(material) + return self._make_request(sub_url, method=method) + class MatScholarRestError(Exception): """ @@ -222,11 +227,3 @@ class MatScholarRestError(Exception): pass -if __name__ == '__main__': - query = { - 'material' : ['GaN', '-InN'], - 'application' : ['LED'] - } - query = json.dumps(query) - rest = Rester() - print(rest.get_summary(query)) diff --git a/matscholar/tests/test_rest.py b/matscholar/tests/test_rest.py index a89b119..f41df19 100644 --- a/matscholar/tests/test_rest.py +++ b/matscholar/tests/test_rest.py @@ -182,4 +182,19 @@ def test_summary(self): result = self.rester.get_summary(self.test_query) self.assertEqual(result['MAT'][0][1], 738) subkeys = [key for key in self.KEYS if key != 'doi'] - self.assertTrue(all(key in result for key in subkeys)) \ No newline at end of file + self.assertTrue(all(key in result for key in subkeys)) + +class SimilarMaterialsTest(unittest.TestCase): + + rester = Rester() + + def test_similar_materials(self): + material = 'LiCoO2' + result = self.rester.get_similar_materials(material) + self.assertEqual(len(result), 10) + similar_mats = ['CoLi2NiO4', 'Co3Li10Ni7O20', 'CoLi4Ni3O8', 'CoLi3MnO5', 'CoLi2O4Si', + 'FeLiO2', 'CoLi3MnNiO6', 'CoLi10Ni9O20', 'CoLiMnO4', 'Fe2Li3O4P'] + self.assertEqual(result, similar_mats) + + + From 
473a0a7d1e825edc306b79779e3014af5be9a252 Mon Sep 17 00:00:00 2001 From: Kevin Yang Date: Mon, 8 Apr 2019 13:54:08 -0700 Subject: [PATCH 02/19] changed rest.py to add journal resource --- matscholar/rest.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/matscholar/rest.py b/matscholar/rest.py index cd820a1..c4c89d7 100644 --- a/matscholar/rest.py +++ b/matscholar/rest.py @@ -200,6 +200,20 @@ def search_ents(self, query): return self._make_request(sub_url, payload=payload, method=method) + def get_journals(self, query): + ''' + + :param query: string: a paragraph + :return: list: [['journal name', 'cosine similarity'], ...] + ''' + + method = 'POST' + sub_url = '/journal_suggestion' + payload = {'abstract': query} + + return self._make_request(sub_url, payload=payload, method=method) + + def get_summary(self, query): ''' Get a summary of the entities associated with a given query From 76ba8c9d880438796b3d327b90493e81fc794a5c Mon Sep 17 00:00:00 2001 From: LeighWeston86 Date: Mon, 8 Apr 2019 15:27:53 -0700 Subject: [PATCH 03/19] added get_ner_tags to Rester --- matscholar/rest.py | 45 ++++++++++++++++++++++++++--------- matscholar/tests/test_rest.py | 31 ++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/matscholar/rest.py b/matscholar/rest.py index cd820a1..ae64da4 100644 --- a/matscholar/rest.py +++ b/matscholar/rest.py @@ -47,7 +47,7 @@ class Rester(object): def __init__(self, api_key=None, endpoint=None): self.api_key = api_key if api_key else environ['MATERIALS_SCHOLAR_API_KEY'] - self.preamble = endpoint if endpoint else environ['MATERIALS_SCHOLAR_ENDPOINT'] + self.preamble = endpoint if endpoint else "http://0.0.0.0:8080" #environ['MATERIALS_SCHOLAR_ENDPOINT'] self.session = requests.Session() self.session.headers = {"x-api-key": self.api_key} @@ -188,36 +188,61 @@ def materials_map(self, highlight, limit=None, ignore_missing=True, number_to_su return self._make_request(sub_url, payload=payload, 
method=method) def search_ents(self, query): - ''' + """ Get the entities in each document associated with a given query :param query: dict; e.g., {'material': ['GaN', '-InN']), 'application': ['LED']} :return: list of dicts; each dict represents a document and contains the extracted entities - ''' - method = 'POST' - sub_url = '/ent_search' + """ + + method = "POST" + sub_url = "/ent_search" payload = query return self._make_request(sub_url, payload=payload, method=method) def get_summary(self, query): - ''' + """ Get a summary of the entities associated with a given query :param query: dict; e.g., {'material': ['GaN', '-InN']), 'application': ['LED']} :return: dict; a summary dict with keys for each entity type - ''' - method = 'POST' - sub_url = '/ent_search/summary' + """ + + method = "POST" + sub_url = "/ent_search/summary" payload = query return self._make_request(sub_url, payload=payload, method=method) def get_similar_materials(self, material): + """ + Finds the most similar compositions in the corpus. + + :param material: string; a chemical composition + :return: list; the most similar compositions + """ method = "GET" sub_url = '/materials/similar/{}'.format(material) return self._make_request(sub_url, method=method) + def get_ner_tags(self, docs, return_type="iob"): + """ + Performs Named Entity Recognition. + + :param docs: list; a list of documents; each document is represented as a single string + :param return_type: string; output format, can be "iob", "concatenated", or "normalized" + :return: list; tagged documents + """ + + method = "POST" + sub_url = "/ner" + payload = { + "docs": docs, + "return_type": return_type + } + return self._make_request(sub_url, payload=payload, method=method) + class MatScholarRestError(Exception): """ @@ -225,5 +250,3 @@ class MatScholarRestError(Exception): Raised when the query has problems, e.g., bad query format. 
""" pass - - diff --git a/matscholar/tests/test_rest.py b/matscholar/tests/test_rest.py index f41df19..e9004d7 100644 --- a/matscholar/tests/test_rest.py +++ b/matscholar/tests/test_rest.py @@ -175,12 +175,12 @@ class EntSearchTest(unittest.TestCase): def test_ent_search(self): result = self.rester.search_ents(self.test_query) - self.assertEqual(len(result), 738) + self.assertEqual(len(result), 1126) self.assertTrue(all(key in result[0].keys() for key in self.KEYS)) def test_summary(self): result = self.rester.get_summary(self.test_query) - self.assertEqual(result['MAT'][0][1], 738) + self.assertEqual(result['MAT'][0][1], 1126) subkeys = [key for key in self.KEYS if key != 'doi'] self.assertTrue(all(key in result for key in subkeys)) @@ -196,5 +196,32 @@ def test_similar_materials(self): 'FeLiO2', 'CoLi3MnNiO6', 'CoLi10Ni9O20', 'CoLiMnO4', 'Fe2Li3O4P'] self.assertEqual(result, similar_mats) +class NERTest(unittest.TestCase): + + rester = Rester() + TEST_DOCS = ["We synthesized AO2 (A = Sr, Ba) thin films. The band gap was 2.5 eV.", + "The lattice constant of ZnO is 3.8 A. 
This was measured using XRD."] + + def test_iob(self): + tagged_docs = self.rester.get_ner_tags(self.TEST_DOCS, return_type="iob") + print(tagged_docs) + self.assertEqual(len(tagged_docs), 2) + self.assertEqual(len(tagged_docs[0]), 2) + self.assertEqual(tagged_docs[0][0][2][1], "B-MAT") + + def test_concatenated(self): + tagged_docs = self.rester.get_ner_tags(self.TEST_DOCS, return_type="concatenated") + self.assertEqual(len(tagged_docs), 2) + self.assertEqual(len(tagged_docs[0]), 2) + self.assertEqual(tagged_docs[0][0][2][1], "MAT") + self.assertEqual(tagged_docs[0][0][2][0], "AO2 ( A = Sr , Ba )") + self.assertFalse(any("-" in tag for token, tag in tagged_docs[0][0])) + + def test_normalized(self): + tagged_docs = self.rester.get_ner_tags(self.TEST_DOCS, return_type="normalized") + self.assertEqual(len(tagged_docs), 2) + self.assertEqual(len(tagged_docs[0]), 2) + self.assertEqual(tagged_docs[0][0][2][1], "MAT") + self.assertTrue(isinstance(tagged_docs[0][0][2][0], list)) From c70ea8956d6308c82e354ef10f9fef098189e640 Mon Sep 17 00:00:00 2001 From: LeighWeston86 Date: Fri, 12 Apr 2019 12:13:38 -0700 Subject: [PATCH 04/19] added materials_search_ents to Rester --- matscholar/rest.py | 28 ++++++++++++++++++++++++++-- matscholar/tests/test_rest.py | 17 +++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/matscholar/rest.py b/matscholar/rest.py index ae64da4..198bb83 100644 --- a/matscholar/rest.py +++ b/matscholar/rest.py @@ -47,7 +47,7 @@ class Rester(object): def __init__(self, api_key=None, endpoint=None): self.api_key = api_key if api_key else environ['MATERIALS_SCHOLAR_API_KEY'] - self.preamble = endpoint if endpoint else "http://0.0.0.0:8080" #environ['MATERIALS_SCHOLAR_ENDPOINT'] + self.preamble = endpoint if endpoint else environ['MATERIALS_SCHOLAR_ENDPOINT'] self.session = requests.Session() self.session.headers = {"x-api-key": self.api_key} @@ -226,7 +226,7 @@ def get_similar_materials(self, material): sub_url = 
'/materials/similar/{}'.format(material) return self._make_request(sub_url, method=method) - def get_ner_tags(self, docs, return_type="iob"): + def get_ner_tags(self, docs, return_type="concatenated"): """ Performs Named Entity Recognition. @@ -243,6 +243,30 @@ def get_ner_tags(self, docs, return_type="iob"): } return self._make_request(sub_url, payload=payload, method=method) + def materials_search_ents(self, entities, elements, cutoff=None): + """ + Finds materials that co-occur with specified entities. The returned materials can be screened + by specifying elements that must be included/excluded from the stoichiometry. + + :param entities: list of strings; each string is a property or application + :param elements: list of strings; each string is a chemical element. Materials + will only be returned if they contain these elements; the opposite can also be + achieved - materials can be removed from the returned list by placing a negative + sign in from of the element, e.g., "-Ti" + :param cutoff: int or None; if int, specifies the number of materials to + return; if None, returns all materials + :return: list; a list of chemical compositions + """ + + method = "POST" + sub_url = "/search/material_search" + payload = { + "entities": entities, + "elements": elements, + "cutoff": cutoff + } + return self._make_request(sub_url, payload=payload, method=method) + class MatScholarRestError(Exception): """ diff --git a/matscholar/tests/test_rest.py b/matscholar/tests/test_rest.py index e9004d7..8f7ad3d 100644 --- a/matscholar/tests/test_rest.py +++ b/matscholar/tests/test_rest.py @@ -224,4 +224,21 @@ def test_normalized(self): self.assertEqual(tagged_docs[0][0][2][1], "MAT") self.assertTrue(isinstance(tagged_docs[0][0][2][0], list)) +class MaterialSearchEntsTest(unittest.TestCase): + + rester = Rester() + TEST_QUERY = { + "entities": ["ferroelectric"], + "elements": ["O", "-Pb"], + "cutoff": None + } + + def test_materials_search(self): + result = 
self.rester.materials_search_ents(**self.TEST_QUERY) + self.assertEqual(result[0][0], "BaO3Ti") + self.assertTrue(not any("Pb" in mat for mat, _, _ in result)) + self.assertTrue(all("O" in mat for mat, _, _ in result)) + + + From 6cf8ef10e9b52a3ebd2180413f61d3a6f8c7526f Mon Sep 17 00:00:00 2001 From: Amalie Trewartha Date: Mon, 15 Apr 2019 13:42:59 -0700 Subject: [PATCH 05/19] Added search_text_with_ents API hook --- matscholar/rest.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/matscholar/rest.py b/matscholar/rest.py index 198bb83..eee6e4d 100644 --- a/matscholar/rest.py +++ b/matscholar/rest.py @@ -267,6 +267,27 @@ def materials_search_ents(self, entities, elements, cutoff=None): } return self._make_request(sub_url, payload=payload, method=method) + def search_text_with_ents(self, text, filters, cutoff=None): + """ + Search abstracts by text with filters for entities + + :param text: string; text to search + :param filters: dict; e.g., {'material': ['GaN', '-InN']), 'application': ['LED']} + :param cutoff: int or None; if int, specifies the number of matches to + return; if None, returns all matches + :return: list; a list of chemical compositions + """ + + method = "POST" + sub_url = "/search/" + query = {"query": filters, "limit": cutoff} + query['query']['text'] = text + payload = { + "query": query, + "cutoff": cutoff + } + return self._make_request(sub_url, payload=payload, method=method) + class MatScholarRestError(Exception): """ From 9206e1ac23b00e6f946eb54c33097473dd7df275 Mon Sep 17 00:00:00 2001 From: John Dagdelen Date: Mon, 20 May 2019 00:14:44 -0700 Subject: [PATCH 06/19] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 878569c..8491ecf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ monty pymatgen gensim chemdataextractor +pandas From 4c8bd011e6415a9b476d21e0eb75eeade30b8c58 Mon Sep 17 00:00:00 2001 From: 
jdagdelen Date: Tue, 21 May 2019 16:47:48 -0700 Subject: [PATCH 07/19] adding journal suggestion --- matscholar/rest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/matscholar/rest.py b/matscholar/rest.py index e02a7a6..26c54fd 100644 --- a/matscholar/rest.py +++ b/matscholar/rest.py @@ -66,6 +66,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): def _make_request(self, sub_url, payload=None, method="GET"): response = None url = self.preamble + sub_url + print(url) try: if method == "POST": response = self.session.post(url, json=payload, verify=True) From ae528599288ddc5375b9762c160cb800727162b1 Mon Sep 17 00:00:00 2001 From: Amalie Trewartha Date: Tue, 21 May 2019 18:45:33 -0700 Subject: [PATCH 08/19] Fixed text_search_with_ents method in rest.py --- matscholar/rest.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/matscholar/rest.py b/matscholar/rest.py index 26c54fd..137ed82 100644 --- a/matscholar/rest.py +++ b/matscholar/rest.py @@ -294,13 +294,13 @@ def search_text_with_ents(self, text, filters, cutoff=None): """ method = "POST" - sub_url = "/search/" - query = {"query": filters, "limit": cutoff} - query['query']['text'] = text + sub_url = "/search" + filters['text'] = text payload = { - "query": query, - "cutoff": cutoff + "query": filters, + "limit": cutoff } + return self._make_request(sub_url, payload=payload, method=method) From 8e25800a3c010f5fa5565e21801569734d05629f Mon Sep 17 00:00:00 2001 From: LeighWeston86 Date: Wed, 22 May 2019 14:00:41 -0700 Subject: [PATCH 09/19] add API documentation to README.md --- README.md | 139 +++++++++++++++++++++++++++++++++++++++++++-- matscholar/rest.py | 5 +- 2 files changed, 136 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 3e6e4f1..f42c9f4 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,154 @@ matscholar logo -`matscholar` (Materials Scholar) is a Python library for materials-focused natural language processing (NLP). 
It is maintained by a team of researchers at UC Berkeley and Lawrence Berkeley National Laboratory as part of a project funded by the Toyota Research Institute. +`matscholar` (Materials Scholar) is a Python library for materials-focused natural language +processing (NLP). It is maintained by a team of researchers at UC Berkeley and Lawrence Berkeley +National Laboratory as part of a project funded by the Toyota Research Institute. -This library provides a Python interface for interacting with the Materials Scholar API, performing basic NLP tasks on scientific text, and example notebooks on using these tools for materials discovery and design. +This library provides a Python interface for interacting with the Materials Scholar API, performing +basic NLP tasks on scientific text, and example notebooks on using these tools for materials +discovery and design. ## Setup -We *highly* recommend using a [conda environment](https://conda.io/docs/user-guide/tasks/manage-environments.html) when working with materials scholar tools. +We *highly* recommend using a [conda environment](https://conda.io/docs/user-guide/tasks/manage-environments.html) +when working with materials scholar tools. 1. Clone or download this repo 2. Navigate to the root directory (matscholar) 3. `pip install -r requirements.txt` -4. `pip install .` [or](https://stackoverflow.com/questions/15724093/difference-between-python-setup-py-install-and-pip-install) `python setup.py install` +4. `pip install .` [or](https://stackoverflow.com/questions/15724093/difference-between-python-setup-py-install-and-pip-install) +`python setup.py install` ## Configuring Your API Key The Materials Scholar API can only be accessed by providing an API key in `x-api-key` request header field. To receive an API key to access the Materials Scholar API, please contact John Dagdelen at jdagdelen@lbl.gov. -Once you have an API key, you can add it as an environment variable `MATSCHOLAR_API_KEY` for ease of use. 
+## API Usage + +For convenience, the Materials Scholar API can be accessed via a python wrapper. + +### Instantiating the Rester + +If an API key has already been obtained, the rester is instantiated as follows: + +```python +from matscholar.rest import Rester + +rester = Rester(api_key="your-api-key", endpoint="matscholar-endpoint") +``` + +To avoid passing the API key and endpoint as arguments, set the following environment variables +for ease of use: `MATSCHOLAR_API_KEY`, `MATERIALS_SCHOLAR_ENDPOINT`. + +### Resources + +The methods of the Rester class can be used to access resources of the Materials Scholar API. + +**Searching documents** + +Our corpus of materials science abstracts can be searched based on text matching +(ElasticSearch) or by filtering based on the Named Entities extracted from each document. +Entity based searches support the following entity types: material, property, application, +descriptor, characterization, synthesis, phase. + +To get the raw text of abstracts matching a given query: + +```python +example_text = "solid oxide fuel cells" +example_entities = {"material": ["BaZrO3"], "descriptor": ["nanoparticle", "-thin film"]} +docs = rester.search_text_with_ents(text=example_entities, filters=example_entities) +``` + +This will return a list of dictionaries containing the raw-text for each abstracts along with +associated metadata. + +**Searching entities** +We have extracted materials-science named entities form nearly 3.5 million materials science +absracts. Details on how this was performed can be found here: Reference coming soon... + +The extracted named entities for each document associated with a query are returned by the +search_ents method. This method takes a dictionary with entity types as keys and a list of entities + for values as input. 
For example, to find all of the entities that co-occur with the material +"GaN": + +```python +docs = rester.search_text_with_ents(query={"material": ["GaN"]}) +``` + +This wil return a list of dictionaries representing documents matching the query; each dict will contain +the DOI as well as each unique entity found in the corresponding abstract. + +A summary of the entities associated with a query can be generated using the get_summary method. To get +statistics for entities co-occuring with GaN, + +```python +summary = rester.get_summary(query={"material": ["GaN"]}) +``` + This will return a dictionary with entity types as keys; the values will be a list of the top entities + that occur in documents matching the query, each item in will be (entity, document count, fraction). + +To perform a fast literature review, the materials_search_ents method may be used. For a chosen application, +this will return a list of all materials that co-occur with that application in our corpus. For example, +to see which materials co-occur with the word thermoelectric in a document, + +```python +mat_list = rester.materials_search_ents(["thermoelectric"], elements=["-Pb"], cutoff=None) +``` + +The above search will find all materials that that do not contain lead. The result will be a list, with each +element containing a list of [material, co-occurence counts, co-occurrence dois]. + +**Word embeddings** +Materials science word embeddings trained using word2vec; details on how the embeddings were trained, +and their application in materials science discover can be found here: Tshitoyan et al., Nature, (accepted, +reference coming soon). + +To get the word embedding for a given word, +```python +embeddings = rester.get_embeddings("photovoltaics") +``` + +This will return a dict containing the embedding. The word embedding will be a 200-dimensional array. 
+ +The rester also has a close_words method (based on cosine similarity of embeddings) which can be used to +explore the semantic similarity of materials science terms; this approach can be used discover materials, +for a new application (as outlined in the reference above), + +To find words with a similar embedding to photovolatic: + +```python +close_words = rester.close_words("photovoltaics", top_k=1000) +``` + +This will return the 1000 closest words to photovoltaics. The result will be a dictionary containing +the close words and their cosine similarity to the input word. + +**Named Entity Recognition** +In addition to the pre-processed entities present in our corpus, users can performed Named Entity +Recognition on any raw materials science text. The details of the model can be found here: + +The input should be a list of documents with the text represented as a string. + +```python +doc_1 = "The bands gap of TiO2 is 3.2 eV. This was measured via photoluminescence" +doc_2 = "We deposit GaN thin films using MOCVD" +docs = [doc_1, doc_2] +tagged_docs = rester.get_ner_tags(docs, return_type="concatenated") +``` + +The arguement return_type may be set to iob, concatenated, or normalized. The latter will replace +entities with their most frequently occurring synonym. A list of tagged documents will be returned. +Each doc is a list of sentences; each sentence is a list of (word, tag) pairs. 
+ +## Citation +If you use any of the API functionality in your research, please consider citing the following papers +where relevent: + +[1] Tshitoyan et al., Nature (accepted) +[2] Weston et al., coming soon + ## Contributors @jdagdelen, @vtshitoyan, @lweston diff --git a/matscholar/rest.py b/matscholar/rest.py index 137ed82..1790d1c 100644 --- a/matscholar/rest.py +++ b/matscholar/rest.py @@ -47,7 +47,7 @@ class Rester(object): def __init__(self, api_key=None, endpoint=None): self.api_key = api_key if api_key else environ['MATERIALS_SCHOLAR_API_KEY'] - self.preamble = endpoint if endpoint else environ['MATERIALS_SCHOLAR_ENDPOINT'] + self.preamble = endpoint if endpoint else "http://0.0.0.0:8080" #environ['MATERIALS_SCHOLAR_ENDPOINT'] self.session = requests.Session() self.session.headers = {"x-api-key": self.api_key} @@ -285,7 +285,6 @@ def materials_search_ents(self, entities, elements, cutoff=None): def search_text_with_ents(self, text, filters, cutoff=None): """ Search abstracts by text with filters for entities - :param text: string; text to search :param filters: dict; e.g., {'material': ['GaN', '-InN']), 'application': ['LED']} :param cutoff: int or None; if int, specifies the number of matches to @@ -300,7 +299,7 @@ def search_text_with_ents(self, text, filters, cutoff=None): "query": filters, "limit": cutoff } - + return self._make_request(sub_url, payload=payload, method=method) From dfc373dab9f1cc6a0fee83ab4aab495c3b19b211 Mon Sep 17 00:00:00 2001 From: Leigh Weston Date: Wed, 22 May 2019 14:01:50 -0700 Subject: [PATCH 10/19] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index f42c9f4..2fa1eb1 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,7 @@ This will return a list of dictionaries containing the raw-text for each abstrac associated metadata. **Searching entities** + We have extracted materials-science named entities form nearly 3.5 million materials science absracts. 
Details on how this was performed can be found here: Reference coming soon... @@ -101,6 +102,7 @@ The above search will find all materials that that do not contain lead. The resu element containing a list of [material, co-occurence counts, co-occurrence dois]. **Word embeddings** + Materials science word embeddings trained using word2vec; details on how the embeddings were trained, and their application in materials science discover can be found here: Tshitoyan et al., Nature, (accepted, reference coming soon). @@ -126,6 +128,7 @@ This will return the 1000 closest words to photovoltaics. The result will be a d the close words and their cosine similarity to the input word. **Named Entity Recognition** + In addition to the pre-processed entities present in our corpus, users can performed Named Entity Recognition on any raw materials science text. The details of the model can be found here: @@ -143,10 +146,12 @@ entities with their most frequently occurring synonym. A list of tagged documen Each doc is a list of sentences; each sentence is a list of (word, tag) pairs. 
## Citation + If you use any of the API functionality in your research, please consider citing the following papers where relevent: [1] Tshitoyan et al., Nature (accepted) + [2] Weston et al., coming soon From 618ae08a6cfe665705cfacb1e89a8713b91f33b5 Mon Sep 17 00:00:00 2001 From: Leigh Weston Date: Wed, 22 May 2019 14:21:24 -0700 Subject: [PATCH 11/19] formatting for README.md --- README.md | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 2fa1eb1..8a51c30 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ If an API key has already been obtained, the rester is instantiated as follows: ```python from matscholar.rest import Rester -rester = Rester(api_key="your-api-key", endpoint="matscholar-endpoint") +rester = Rester(api_key="your-api-key", endpoint="api.matscholar.com") ``` To avoid passing the API key and endpoint as arguments, set the following environment variables @@ -66,16 +66,16 @@ associated metadata. **Searching entities** -We have extracted materials-science named entities form nearly 3.5 million materials science -absracts. Details on how this was performed can be found here: Reference coming soon... +We have extracted materials-science named entities from nearly 3.5 million materials science +absracts. Details on how this was performed can be found in Ref. [1]. The extracted named entities for each document associated with a query are returned by the -search_ents method. This method takes a dictionary with entity types as keys and a list of entities - for values as input. For example, to find all of the entities that co-occur with the material +search_ents method. This method takes as input a dictionary with entity types as keys and a list of entities + as values. 
For example, to find all of the entities that co-occur with the material "GaN": ```python -docs = rester.search_text_with_ents(query={"material": ["GaN"]}) +docs = rester.search_ents(query={"material": ["GaN"]}) ``` This wil return a list of dictionaries representing documents matching the query; each dict will contain @@ -88,7 +88,7 @@ statistics for entities co-occuring with GaN, summary = rester.get_summary(query={"material": ["GaN"]}) ``` This will return a dictionary with entity types as keys; the values will be a list of the top entities - that occur in documents matching the query, each item in will be (entity, document count, fraction). + that occur in documents matching the query, each item in the list will be [entity, document count, fraction]. To perform a fast literature review, the materials_search_ents method may be used. For a chosen application, this will return a list of all materials that co-occur with that application in our corpus. For example, @@ -98,14 +98,13 @@ to see which materials co-occur with the word thermoelectric in a document, mat_list = rester.materials_search_ents(["thermoelectric"], elements=["-Pb"], cutoff=None) ``` -The above search will find all materials that that do not contain lead. The result will be a list, with each -element containing a list of [material, co-occurence counts, co-occurrence dois]. +The above search will find all materials co-occurring with thermoelectric that do not contain lead. +The result will be a list, with each element containing a list of [material, co-occurence counts, co-occurrence dois]. **Word embeddings** Materials science word embeddings trained using word2vec; details on how the embeddings were trained, -and their application in materials science discover can be found here: Tshitoyan et al., Nature, (accepted, -reference coming soon). +and their application in materials science discovery can be found in Ref. [2]. 
To get the word embedding for a given word, ```python @@ -115,7 +114,7 @@ embeddings = rester.get_embeddings("photovoltaics") This will return a dict containing the embedding. The word embedding will be a 200-dimensional array. The rester also has a close_words method (based on cosine similarity of embeddings) which can be used to -explore the semantic similarity of materials science terms; this approach can be used discover materials, +explore the semantic similarity of materials science terms; this approach can be used discover materials for a new application (as outlined in the reference above), To find words with a similar embedding to photovolatic: @@ -130,9 +129,9 @@ the close words and their cosine similarity to the input word. **Named Entity Recognition** In addition to the pre-processed entities present in our corpus, users can performed Named Entity -Recognition on any raw materials science text. The details of the model can be found here: +Recognition on any raw materials science text. The details of the model can be found in Ref. [1]. -The input should be a list of documents with the text represented as a string. +The input should be a list of documents with the text represented as a string: ```python doc_1 = "The bands gap of TiO2 is 3.2 eV. This was measured via photoluminescence" @@ -150,9 +149,9 @@ Each doc is a list of sentences; each sentence is a list of (word, tag) pairs. 
If you use any of the API functionality in your research, please consider citing the following papers where relevent: -[1] Tshitoyan et al., Nature (accepted) +[1] Weston et al., coming soon -[2] Weston et al., coming soon +[2] Tshitoyan et al., Nature (accepted) ## Contributors From 82286e51d96df8a4fbaf60d0d2c38afb394dc5cc Mon Sep 17 00:00:00 2001 From: LeighWeston86 Date: Thu, 23 May 2019 09:45:40 -0700 Subject: [PATCH 12/19] endpoint --- matscholar/rest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matscholar/rest.py b/matscholar/rest.py index 1790d1c..98e9ab7 100644 --- a/matscholar/rest.py +++ b/matscholar/rest.py @@ -47,7 +47,7 @@ class Rester(object): def __init__(self, api_key=None, endpoint=None): self.api_key = api_key if api_key else environ['MATERIALS_SCHOLAR_API_KEY'] - self.preamble = endpoint if endpoint else "http://0.0.0.0:8080" #environ['MATERIALS_SCHOLAR_ENDPOINT'] + self.preamble = endpoint if endpoint else environ['MATERIALS_SCHOLAR_ENDPOINT'] self.session = requests.Session() self.session.headers = {"x-api-key": self.api_key} From 3c5f064534928c074a59112d14cc0c3baf2e1405 Mon Sep 17 00:00:00 2001 From: Leigh Weston Date: Thu, 23 May 2019 15:24:57 -0700 Subject: [PATCH 13/19] Small changes to README.md --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8a51c30..2689209 100644 --- a/README.md +++ b/README.md @@ -56,9 +56,14 @@ descriptor, characterization, synthesis, phase. 
To get the raw text of abstracts matching a given query: ```python +# text match for "solid oxide fuel cells" example_text = "solid oxide fuel cells" + +# entity filters: include documents mentioning BaZrO3 and nanoparticles; +# exclude documents mentioning thin films example_entities = {"material": ["BaZrO3"], "descriptor": ["nanoparticle", "-thin film"]} -docs = rester.search_text_with_ents(text=example_entities, filters=example_entities) + +docs = rester.search_text_with_ents(text=example_text, filters=example_entities) ``` This will return a list of dictionaries containing the raw-text for each abstracts along with From 88e6a41654a32fe88d3d2bd8812dfd5bf13f82d8 Mon Sep 17 00:00:00 2001 From: Anubhav Jain Date: Fri, 31 May 2019 18:22:43 -0700 Subject: [PATCH 14/19] fix typo in README as far as i can tell, get_embeddings is get_embedding --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2689209..8615dc7 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,7 @@ and their application in materials science discovery can be found in Ref. [2]. To get the word embedding for a given word, ```python -embeddings = rester.get_embeddings("photovoltaics") +embedding = rester.get_embedding("photovoltaics") ``` This will return a dict containing the embedding. The word embedding will be a 200-dimensional array. From cc518c2bbce44ef5bad464fd0b5100cb93c01b91 Mon Sep 17 00:00:00 2001 From: jdagdelen Date: Mon, 3 Jun 2019 16:15:11 -0700 Subject: [PATCH 15/19] fixing unidecode version error. 
--- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8491ecf..eead8be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ pyyaml requests pytest numpy -unidecode +unidecode==1.0.23 regex monty pymatgen From 0e34c5570eca7b1de1e131385206676f9fa3095c Mon Sep 17 00:00:00 2001 From: Leigh Weston Date: Tue, 4 Jun 2019 11:11:40 -0700 Subject: [PATCH 16/19] fix credits in rest.py --- matscholar/rest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matscholar/rest.py b/matscholar/rest.py index 98e9ab7..2273733 100644 --- a/matscholar/rest.py +++ b/matscholar/rest.py @@ -12,7 +12,7 @@ """ __author__ = "John Dagdelen" -__credits__ = "Shyue Ping Ong, Shreyas Cholia, Anubhav Jain" +__credits__ = "Leigh Weston, Amalie Trewartha, Vahe Tshitoyan" __copyright__ = "Copyright 2018, Materials Intelligence" __version__ = "0.1" __maintainer__ = "John Dagdelen" From a0de4483ec66033acfdce2f175564a28dcf9de62 Mon Sep 17 00:00:00 2001 From: cs464osu <49537751+cs464osu@users.noreply.github.com> Date: Wed, 5 Jun 2019 15:38:09 -0700 Subject: [PATCH 17/19] minor renaming --- matscholar/tests/test_rest.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/matscholar/tests/test_rest.py b/matscholar/tests/test_rest.py index 8f7ad3d..350b43f 100644 --- a/matscholar/tests/test_rest.py +++ b/matscholar/tests/test_rest.py @@ -7,9 +7,9 @@ class EmbeddingEngineTest(unittest.TestCase): r = Rester() - def test_materials_search(self): + def test_search_materials(self): - top_thermoelectrics = self.r.materials_search("thermoelectric", top_k=10) + top_thermoelectrics = self.r.search_materials("thermoelectric", top_k=10) self.assertListEqual(top_thermoelectrics["counts"], [2452, 9, 2598, 13, 5, 9, 831, 167, 8, 390]) self.assertListEqual(top_thermoelectrics["materials"], ['Bi2Te3', 'MgAgSb', 'PbTe', 'PbSe0.5Te0.5', @@ -63,7 +63,7 @@ def test_close_words(self): 
negatives, top_ks, ignore_missing, - close_words, + get_close_words, scores, processed_positives, processed_negatives): @@ -174,12 +174,12 @@ class EntSearchTest(unittest.TestCase): def test_ent_search(self): - result = self.rester.search_ents(self.test_query) + result = self.rester.search_entities(self.test_query) self.assertEqual(len(result), 1126) self.assertTrue(all(key in result[0].keys() for key in self.KEYS)) def test_summary(self): - result = self.rester.get_summary(self.test_query) + result = self.rester.search_entities_summary(self.test_query) self.assertEqual(result['MAT'][0][1], 1126) subkeys = [key for key in self.KEYS if key != 'doi'] self.assertTrue(all(key in result for key in subkeys)) @@ -190,7 +190,7 @@ class SimilarMaterialsTest(unittest.TestCase): def test_similar_materials(self): material = 'LiCoO2' - result = self.rester.get_similar_materials(material) + result = self.rester.get_close_materials(material) self.assertEqual(len(result), 10) similar_mats = ['CoLi2NiO4', 'Co3Li10Ni7O20', 'CoLi4Ni3O8', 'CoLi3MnO5', 'CoLi2O4Si', 'FeLiO2', 'CoLi3MnNiO6', 'CoLi10Ni9O20', 'CoLiMnO4', 'Fe2Li3O4P'] @@ -203,14 +203,14 @@ class NERTest(unittest.TestCase): "The lattice constant of ZnO is 3.8 A. 
This was measured using XRD."] def test_iob(self): - tagged_docs = self.rester.get_ner_tags(self.TEST_DOCS, return_type="iob") + tagged_docs = self.rester.perform_ner(self.TEST_DOCS, return_type="iob") print(tagged_docs) self.assertEqual(len(tagged_docs), 2) self.assertEqual(len(tagged_docs[0]), 2) self.assertEqual(tagged_docs[0][0][2][1], "B-MAT") def test_concatenated(self): - tagged_docs = self.rester.get_ner_tags(self.TEST_DOCS, return_type="concatenated") + tagged_docs = self.rester.perform_ner(self.TEST_DOCS, return_type="concatenated") self.assertEqual(len(tagged_docs), 2) self.assertEqual(len(tagged_docs[0]), 2) self.assertEqual(tagged_docs[0][0][2][1], "MAT") @@ -233,8 +233,8 @@ class MaterialSearchEntsTest(unittest.TestCase): "cutoff": None } - def test_materials_search(self): - result = self.rester.materials_search_ents(**self.TEST_QUERY) + def test_search_materials(self): + result = self.rester.search_materials_by_entities(**self.TEST_QUERY) self.assertEqual(result[0][0], "BaO3Ti") self.assertTrue(not any("Pb" in mat for mat, _, _ in result)) self.assertTrue(all("O" in mat for mat, _, _ in result)) From 40771e14b570e2f7aaa03f93e8388160b2637399 Mon Sep 17 00:00:00 2001 From: cs464osu <49537751+cs464osu@users.noreply.github.com> Date: Wed, 5 Jun 2019 15:38:39 -0700 Subject: [PATCH 18/19] minor renaming --- matscholar/rest.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/matscholar/rest.py b/matscholar/rest.py index 2273733..be9e5d8 100644 --- a/matscholar/rest.py +++ b/matscholar/rest.py @@ -89,7 +89,7 @@ def _make_request(self, sub_url, payload=None, method="GET"): if hasattr(response, "content") else str(ex) raise MatScholarRestError(msg) - def materials_search(self, positive, negative=None, ignore_missing=True, top_k=10): + def search_materials(self, positive, negative=None, ignore_missing=True, top_k=10): """ Given input strings or lists of positive and negative words / phrases, returns a ranked list of materials 
with corresponding scores and numbers of mentions @@ -112,7 +112,7 @@ def materials_search(self, positive, negative=None, ignore_missing=True, top_k=1 return self._make_request(sub_url, payload=payload, method=method) - def close_words(self, positive, negative=None, ignore_missing=True, top_k=10): + def get_close_words(self, positive, negative=None, ignore_missing=True, top_k=10): """ Given input strings or lists of positive and negative words / phrases, returns a list of most similar words / phrases according to cosine similarity @@ -188,7 +188,7 @@ def materials_map(self, highlight, limit=None, ignore_missing=True, number_to_su return self._make_request(sub_url, payload=payload, method=method) - def search_ents(self, query): + def search_entities(self, query): """ Get the entities in each document associated with a given query @@ -202,7 +202,7 @@ def search_ents(self, query): return self._make_request(sub_url, payload=payload, method=method) - def get_journals(self, query): + def get_close_journals(self, query): ''' :param query: string: a paragraph @@ -216,7 +216,7 @@ def get_journals(self, query): return self._make_request(sub_url, payload=payload, method=method) - def get_summary(self, query): + def search_entities_summary(self, query): """ Get a summary of the entities associated with a given query @@ -230,7 +230,7 @@ def get_summary(self, query): return self._make_request(sub_url, payload=payload, method=method) - def get_similar_materials(self, material): + def get_close_materials(self, material): """ Finds the most similar compositions in the corpus. @@ -241,7 +241,7 @@ def get_similar_materials(self, material): sub_url = '/materials/similar/{}'.format(material) return self._make_request(sub_url, method=method) - def get_ner_tags(self, docs, return_type="concatenated"): + def perform_ner(self, docs, return_type="concatenated"): """ Performs Named Entity Recognition. 
@@ -258,7 +258,7 @@ def get_ner_tags(self, docs, return_type="concatenated"): } return self._make_request(sub_url, payload=payload, method=method) - def materials_search_ents(self, entities, elements, cutoff=None): + def search_materials_by_entities(self, entities, elements, cutoff=None): """ Finds materials that co-occur with specified entities. The returned materials can be screened by specifying elements that must be included/excluded from the stoichiometry. @@ -282,7 +282,7 @@ def materials_search_ents(self, entities, elements, cutoff=None): } return self._make_request(sub_url, payload=payload, method=method) - def search_text_with_ents(self, text, filters, cutoff=None): + def search_documents(self, text, filters, cutoff=None): """ Search abstracts by text with filters for entities :param text: string; text to search From 58259dda00209e98288bd2bd564788751d17df17 Mon Sep 17 00:00:00 2001 From: cs464osu <49537751+cs464osu@users.noreply.github.com> Date: Wed, 5 Jun 2019 15:39:08 -0700 Subject: [PATCH 19/19] minor renaming --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 8615dc7..1d9674f 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ example_text = "solid oxide fuel cells" # exclude documents mentioning thin films example_entities = {"material": ["BaZrO3"], "descriptor": ["nanoparticle", "-thin film"]} -docs = rester.search_text_with_ents(text=example_text, filters=example_entities) +docs = rester.search_documents(text=example_text, filters=example_entities) ``` This will return a list of dictionaries containing the raw-text for each abstract along with @@ -75,32 +75,32 @@ We have extracted materials-science named entities from nearly 3.5 million mater abstracts. Details on how this was performed can be found in Ref. [1]. The extracted named entities for each document associated with a query are returned by the -search_ents method.
This method takes as input a dictionary with entity types as keys and a list of entities +search_entities method. This method takes as input a dictionary with entity types as keys and a list of entities as values. For example, to find all of the entities that co-occur with the material "GaN": ```python -docs = rester.search_ents(query={"material": ["GaN"]}) +docs = rester.search_entities(query={"material": ["GaN"]}) ``` This will return a list of dictionaries representing documents matching the query; each dict will contain the DOI as well as each unique entity found in the corresponding abstract. -A summary of the entities associated with a query can be generated using the get_summary method. To get +A summary of the entities associated with a query can be generated using the search_entities_summary method. To get statistics for entities co-occurring with GaN, ```python -summary = rester.get_summary(query={"material": ["GaN"]}) +summary = rester.search_entities_summary(query={"material": ["GaN"]}) ``` This will return a dictionary with entity types as keys; the values will be a list of the top entities that occur in documents matching the query, each item in the list will be [entity, document count, fraction]. -To perform a fast literature review, the materials_search_ents method may be used. For a chosen application, +To perform a fast literature review, the search_materials_by_entities method may be used. For a chosen application, this will return a list of all materials that co-occur with that application in our corpus. For example, to see which materials co-occur with the word thermoelectric in a document, ```python -mat_list = rester.materials_search_ents(["thermoelectric"], elements=["-Pb"], cutoff=None) +mat_list = rester.search_materials_by_entities(["thermoelectric"], elements=["-Pb"], cutoff=None) ``` The above search will find all materials co-occurring with thermoelectric that do not contain lead.
@@ -118,14 +118,14 @@ embedding = rester.get_embedding("photovoltaics") This will return a dict containing the embedding. The word embedding will be a 200-dimensional array. -The rester also has a close_words method (based on cosine similarity of embeddings) which can be used to +The rester also has a get_close_words method (based on cosine similarity of embeddings) which can be used to explore the semantic similarity of materials science terms; this approach can be used to discover materials for a new application (as outlined in the reference above). To find words with a similar embedding to photovoltaics: ```python -close_words = rester.close_words("photovoltaics", top_k=1000) +close_words = rester.get_close_words("photovoltaics", top_k=1000) ``` This will return the 1000 closest words to photovoltaics. The result will be a dictionary containing @@ -142,7 +142,7 @@ The input should be a list of documents with the text represented as a string: doc_1 = "The bands gap of TiO2 is 3.2 eV. This was measured via photoluminescence" doc_2 = "We deposit GaN thin films using MOCVD" docs = [doc_1, doc_2] -tagged_docs = rester.get_ner_tags(docs, return_type="concatenated") +tagged_docs = rester.perform_ner(docs, return_type="concatenated") ``` The argument return_type may be set to iob, concatenated, or normalized. The latter will replace