Skip to content

Commit dddbb93

Browse files
Use requests_html to implement the ComposerPopularity feature
The feature had stopped working because Google's response to the static User-Agent string we were sending lacked result counts.
1 parent f4c1f1e commit dddbb93

2 files changed

Lines changed: 9 additions & 12 deletions

File tree

music21/features/native.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import math
1717
from typing import Optional
1818

19-
from urllib.request import Request, urlopen
2019
from urllib.parse import urlencode
2120

2221

@@ -847,20 +846,19 @@ class ComposerPopularity(featuresModule.FeatureExtractor):
847846
848847
Requires an internet connection.
849848
849+
Changed in v7 -- implementation uses the package `requests_html`, which must
850+
be installed.
850851
851852
>>> #_DOCS_SHOW s = corpus.parse('mozart/k155', 2)
852853
>>> s = stream.Score() #_DOCS_HIDE
853854
>>> s.append(metadata.Metadata()) #_DOCS_HIDE
854855
>>> s.metadata.composer = 'W.A. Mozart' #_DOCS_HIDE
855856
>>> fe = features.native.ComposerPopularity(s)
856-
>>> #_DOCS_SHOW fe.extract().vector[0] > 5.0
857-
>>> True #_DOCS_HIDE
857+
>>> fe.extract().vector[0] > 5.0
858858
True
859859
'''
860860
id = 'MD1'
861861
googleResultsRE = re.compile(r'([\d,]+) results')
862-
_M21UserAgent = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) '
863-
+ 'Gecko/20071127 Firefox/2.0.0.11')
864862

865863
def __init__(self, dataOrStream=None, *arguments, **keywords):
866864
super().__init__(dataOrStream=dataOrStream, *arguments, **keywords)
@@ -888,13 +886,11 @@ def process(self):
888886
params = urlencode(paramsBasic)
889887
urlStr = f'http://www.google.com/search?{params}'
890888

891-
headers = {'User-Agent': self._M21UserAgent}
892-
req = Request(urlStr, headers=headers)
893-
with urlopen(req) as response:
894-
the_page = response.read()
895-
the_page = the_page.decode('utf-8')
896-
897-
m = self.googleResultsRE.search(the_page)
889+
from requests_html import HTMLSession
890+
session = HTMLSession()
891+
response = session.get(urlStr)
892+
resultsDiv = response.html.find('div[@id="result-stats"]', first=True)
893+
m = self.googleResultsRE.search(resultsDiv.text)
898894
if m is not None and m.group(0):
899895
totalRes = int(m.group(1).replace(',', ''))
900896
if totalRes > 0:

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ jsonpickle
44
matplotlib
55
more_itertools
66
numpy
7+
requests_html
78
webcolors>=1.5

0 commit comments

Comments
 (0)