Skip to content

Commit 7ab5b35

Browse files
authored
Feature/2024 updates (#101)
* Use new pyproject.toml * Remove old setup.py * Update README to reflect new cli script name seoanalyzer * Update requirements.txt to include much newer versions of necessary packages and remove requests * Refactor everything into pyseoanalyzer directory
1 parent 55fd7b3 commit 7ab5b35

20 files changed

Lines changed: 1167 additions & 737 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# I don't want the python virtual env in github!
22
venv
3+
env
34

45
# nor visual
56
.vscode

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ print(output)
7878
Alternatively, you can run the analysis as a script from the pyseoanalyzer folder.
7979

8080
```sh
81-
python analyzer.py https://www.sethserver.com/ -f html > results.html
81+
python -m pyseoanalyzer https://www.sethserver.com/ -f html > results.html
8282
```
8383

8484
Notes

pyproject.toml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Build configuration (PEP 517/518): hatchling produces the wheel/sdist.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# Package metadata (PEP 621).
[project]
name = "pyseoanalyzer"
# Calendar-style version: YYYY.MM.DD of the release.
version = "2024.04.21"
authors = [
    {name = "Seth Black", email = "sblack@sethserver.com"},
]
dependencies = [
    "beautifulsoup4>=4.12.3",
    "certifi>=2024.2.2",
    "Jinja2>=3.1.3",
    "lxml>=5.2.1",
    "MarkupSafe>=2.1.5",
    "urllib3>=2.2.1",
]
requires-python = ">= 3.8"
description = "An SEO tool that analyzes the structure of a site, crawls the site, count words in the body of the site and warns of any technical SEO issues."
readme = "README.md"
license = {file = "LICENSE"}
keywords = [
    "search engine optimization",
    "seo",
    "website parser",
    "crawler",
    "scraper",
    "site analyzer",
    "site parser",
    "site crawler",
]
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Programming Language :: Python",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3 :: Only",
    "Environment :: Console",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: BSD License",
    "Operating System :: OS Independent",
    "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: Text Processing",
    "Topic :: Internet :: WWW/HTTP",
]

# Console entry point: installing the package creates a `seoanalyze`
# command that invokes pyseoanalyzer.__main__:main.
[project.scripts]
seoanalyze = "pyseoanalyzer.__main__:main"

[project.urls]
Homepage = "https://github.com/sethblack/python-seo-analyzer"
Repository = "https://github.com/sethblack/python-seo-analyzer.git"
Issues = "https://github.com/sethblack/python-seo-analyzer/issues"

pyseoanalyzer/__main__.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env python3
"""Command-line entry point for pyseoanalyzer.

Parses CLI arguments, runs the crawl/analysis via ``analyze``, and prints
the result either as JSON (the default) or rendered through the bundled
Jinja2 HTML template.
"""

import argparse
import inspect
import json
import os

from .analyzer import analyze


def main():
    """Parse command-line arguments, run the analysis, and print the report."""
    # Locate the installed package directory so the HTML template can be
    # found regardless of the current working directory.
    module_path = os.path.dirname(inspect.getfile(analyze))

    arg_parser = argparse.ArgumentParser()

    arg_parser.add_argument("site", help="URL of the site you are wanting to analyze.")
    arg_parser.add_argument(
        "-s", "--sitemap", help="URL of the sitemap to seed the crawler with."
    )
    arg_parser.add_argument(
        "-f",
        "--output-format",
        help="Output format.",
        choices=[
            "json",
            "html",
        ],
        default="json",
    )

    arg_parser.add_argument(
        "--analyze-headings",
        default=False,
        action="store_true",
        help="Analyze heading tags (h1-h6).",
    )
    arg_parser.add_argument(
        "--analyze-extra-tags",
        default=False,
        action="store_true",
        help="Analyze other extra additional tags.",
    )
    # NOTE: store_false + default=True means args.no_follow_links holds the
    # *follow-links* setting: True normally, False when the flag is passed.
    # The original help text described the opposite behavior; fixed here.
    arg_parser.add_argument(
        "--no-follow-links",
        default=True,
        action="store_false",
        help="Do not crawl the site's internal links (by default they are followed, which might be time consuming).",
    )

    args = arg_parser.parse_args()

    output = analyze(
        args.site,
        args.sitemap,
        analyze_headings=args.analyze_headings,
        analyze_extra_tags=args.analyze_extra_tags,
        follow_links=args.no_follow_links,
    )

    if args.output_format == "html":
        # Imported lazily so JSON-only runs never need jinja2 at startup.
        from jinja2 import Environment
        from jinja2 import FileSystemLoader

        env = Environment(
            loader=FileSystemLoader(os.path.join(module_path, "templates"))
        )
        template = env.get_template("index.html")
        output_from_parsed_template = template.render(result=output)
        print(output_from_parsed_template)
    elif args.output_format == "json":
        print(json.dumps(output, indent=4, separators=(",", ": ")))


if __name__ == "__main__":
    main()

pyseoanalyzer/analyzer.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import time

from operator import itemgetter

from .website import Website

# A term must occur more than this many times to be reported as a keyword.
_MIN_KEYWORD_COUNT = 4


def _keyword_entries(sorted_pairs):
    """Yield ``{"word", "count"}`` dicts for (term, count) pairs whose count
    exceeds ``_MIN_KEYWORD_COUNT``, preserving the incoming order."""
    for term, count in sorted_pairs:
        if count > _MIN_KEYWORD_COUNT:
            yield {"word": term, "count": count}


def analyze(
    url,
    sitemap_url=None,
    analyze_headings=False,
    analyze_extra_tags=False,
    follow_links=True,
):
    """Crawl ``url`` and return an analysis report.

    Args:
        url: Root URL of the site to analyze.
        sitemap_url: Optional sitemap URL used to seed the crawler.
        analyze_headings: Also analyze h1-h6 heading tags.
        analyze_extra_tags: Also analyze additional tags.
        follow_links: Crawl the site's internal links as well.

    Returns:
        A dict with ``pages``, ``keywords``, ``duplicate_pages``, ``errors``
        and ``total_time`` (seconds elapsed) keys.
    """
    start_time = time.time()

    output = {
        "pages": [],
        "keywords": [],
        "errors": [],
        "total_time": 0,  # filled in just before returning
    }

    site = Website(
        url,
        sitemap_url,
        analyze_headings,
        analyze_extra_tags,
        follow_links,
    )

    site.crawl()

    for page in site.crawled_pages:
        output["pages"].append(page.talk())

    # Pages whose content hashed identically are reported as duplicates.
    output["duplicate_pages"] = [
        list(pages) for pages in site.content_hashes.values() if len(pages) > 1
    ]

    # Collect frequent words, bigrams and trigrams — in that order, so the
    # stable final sort keeps word -> bigram -> trigram ordering on ties.
    for counts in (site.wordcount, site.bigrams, site.trigrams):
        ranked = sorted(counts.items(), key=itemgetter(1), reverse=True)
        output["keywords"].extend(_keyword_entries(ranked))

    # Sort one last time so words, bigrams and trigrams interleave by count.
    output["keywords"] = sorted(
        output["keywords"], key=itemgetter("count"), reverse=True
    )

    output["total_time"] = time.time() - start_time

    return output
Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,19 @@
33
from urllib3 import Timeout
44

55

6-
class Http:
    """Thin wrapper around a urllib3 ``PoolManager`` configured with short
    timeouts, TLS certificate verification, and a browser-like User-Agent."""

    def __init__(self):
        # Some sites refuse requests without a browser-looking User-Agent.
        user_agent = {"User-Agent": "Mozilla/5.0"}

        self.http = PoolManager(
            # Fail fast: 1 second to connect, 2 seconds to read.
            timeout=Timeout(connect=1.0, read=2.0),
            # Require valid TLS certificates, checked against certifi's CA bundle.
            cert_reqs="CERT_REQUIRED",
            ca_certs=certifi.where(),
            headers=user_agent,
        )

    def get(self, url):
        """Issue a GET request for ``url`` and return the urllib3 response."""
        return self.http.request("GET", url)


# Module-level singleton shared by the rest of the package.
http = Http()
20-

0 commit comments

Comments
 (0)