Skip to content

Commit b14fbe0

Browse files
committed
shortener WIP
1 parent 5c878e5 commit b14fbe0

File tree

7 files changed

+120
-98
lines changed

7 files changed

+120
-98
lines changed

links/data/README.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
data files for testing, used by pytest-datadir

links/data/sample.htaccess

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
ErrorDocument 404 /404.html
2+
3+
# main resources
4+
RedirectTemp /book https://www.oreilly.com/.../9781492056348/
5+
RedirectTemp /home https://www.fluentpython.com/ # extra content site
6+
7+
# duplicate targets
8+
RedirectTemp /1-20 https://www.fluentpython.com/
9+
RedirectTemp /ora https://www.oreilly.com/.../9781492056348/
10+
RedirectTemp /2-10 http://example.com/
11+
RedirectTemp /10-2 http://example.com/

links/prep1_shorten_urls.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,20 @@
33
import fileinput
44
import re
55

6-
URL_RE = re.compile(r'''https?://[^\s[<>"']+''')
6+
URL_RE = re.compile(r"""https?://[^\s[<>"']+""")
7+
78

89
def find_urls(fpy=True, long=True):
910
found = 0
1011
for line in (l.rstrip() for l in fileinput.input()):
1112
if match := URL_RE.search(line):
1213
url = match.group()
1314
is_fpy = '://fpy.li/' in url
14-
if ((is_fpy and not fpy) or
15-
(not is_fpy and not long)):
15+
if (is_fpy and not fpy) or (not is_fpy and not long):
1616
continue
1717
print(url)
1818
found += 1
19-
# print('FOUND', found, 'URLs')
19+
# print('FOUND', found, 'URLs')
2020

2121

2222
if __name__ == '__main__':

links/shorten.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,15 @@
2828
type RedirMap = dict[ShortCode, Url]
2929
type TargetMap = dict[Url, ShortCode]
3030

31+
3132
class ShortPair(NamedTuple):
3233
code: ShortCode
3334
url: Url
3435

36+
3537
def load_redirects() -> tuple[RedirMap, TargetMap]:
36-
redirects:RedirMap = {}
37-
targets:TargetMap = {}
38+
redirects: RedirMap = {}
39+
targets: TargetMap = {}
3840
for filename in HTACCESS_FILES:
3941
with open(filename) as fp:
4042
for line in fp:
@@ -43,8 +45,8 @@ def load_redirects() -> tuple[RedirMap, TargetMap]:
4345
short = field1.encode('ascii')[1:] # Remove leading slash
4446
assert short not in redirects, f'{filename}: duplicate redirect from {short}'
4547
# htaccess.custom has been live since 2022; I can't change it to remove duplicate targets
46-
#if filename != HTACCESS_MAIN:
47-
#assert long not in targets, f'{filename}: duplicate redirect to {long}'
48+
# if filename != HTACCESS_MAIN:
49+
# assert long not in targets, f'{filename}: duplicate redirect to {long}'
4850
if long in targets:
4951
print(f'{filename}: duplicate redirect to {long}')
5052
redirects[short] = long
@@ -56,7 +58,7 @@ def load_redirects() -> tuple[RedirMap, TargetMap]:
5658
SDIGITS = b'23456789abcdefghjkmnpqrstvwxyz'
5759

5860

59-
def gen_short(start_len=1) -> Iterator[ShortCode]:
61+
def gen_short(start_len=2) -> Iterator[ShortCode]:
6062
"""Generate every possible sequence of SDIGITS, starting with start_len"""
6163
length = start_len
6264
while True:
@@ -65,9 +67,9 @@ def gen_short(start_len=1) -> Iterator[ShortCode]:
6567
length += 1
6668

6769

68-
def gen_unused_short(redirects: dict) -> Iterator[ShortCode]:
69-
"""Generate next available short URL of len >= 2."""
70-
for short in gen_short(2):
70+
def gen_unused_short(redirects: dict, start_len=2) -> Iterator[ShortCode]:
71+
"""Generate next available short URL of len >= start_len."""
72+
for short in gen_short(start_len):
7173
if short not in redirects:
7274
yield short
7375

@@ -90,7 +92,7 @@ def shorten(urls: list[str]) -> list[ShortPair]:
9092
if timestamp:
9193
fp.write(f'\n# appended: {timestamp}\n')
9294
timestamp = None
93-
fp.write(f'RedirectTemp /{short.decode('ascii')} {long}\n')
95+
fp.write(f'RedirectTemp /{short.decode("ascii")} {long}\n')
9496
pairs.append((short, long))
9597

9698
return pairs
@@ -105,5 +107,5 @@ def main() -> None:
105107

106108

107109
if __name__ == '__main__':
108-
#main()
110+
# main()
109111
load_redirects()

links/shortener.py

Lines changed: 33 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -4,37 +4,15 @@
44
This script reads a `.htaccess` file and a plain text file with
55
URLs (the target URLs).
66
7-
It outputs a list of target URLs and their corresponding short URLs,
8-
made from paths in the FPI.LI domain like `/2d`, `/2e`, etc.
7+
It outputs a list of target URLs and corresponding paths
8+
for short URLs in the FPY.LI domain like `/2d`, `/2e`, etc.
99
This list is used to replace the target URLs with short URLs
1010
in the `.adoc` files where the target URLs are used.
1111
1212
If a target URL is not in the `.htaccess` file,
1313
the script generates a new short URL
14-
and appends a new `RedirectTemp` directive to the `.htaccess` file.
15-
16-
17-
## `.httaccess` file
18-
19-
A file named `.htaccess` in this format is deployed to the web server
20-
at FPY.LI to redirect short URLs to target URLs (the longer ones).
21-
22-
```
23-
# added: 2025-05-26 16:01:24
24-
RedirectTemp /2d https://mitpress.mit.edu/9780262111584/the-art-of-the-metaobject-protocol/
25-
RedirectTemp /2e https://dabeaz.com/per.html
26-
RedirectTemp /2f https://pythonfluente.com/2/#iter_closer_look
27-
28-
```
29-
30-
When a user agent requests a URL like `https://fpy.li/2d`,
31-
the web server responds with a 302 redirect to the longer URL
32-
`https://mitpress.mit.edu/9780262111584/the-art-of-the-metaobject-protocol/`.
33-
34-
A temporary redirect (code 302)
35-
tells user agents to come back to the same URL at FPY.LI later,
36-
and not update their bookmark.
37-
This allows me update the target URL, if needed.
14+
and adds a new `RedirectTemp` directive to the `.htaccess` file,
15+
appending it in place with a timestamp.
3816
3917
## Redirects in memory
4018
@@ -48,15 +26,18 @@
4826
but the algorithm is more complicated.
4927
5028
The same target URL can be mapped to multiple short paths
51-
due to past mistakes when updating the `.htaccess` file.
29+
in `.htaccess` when the same target URL was added more
30+
than once with different short paths by mistake.
31+
We cannot fix these mistakes because the redundant
32+
short paths are printed in Fluent Python Second Edition.
5233
5334
When loading the `.htaccess` file,
5435
if a target URL is already in the `targets` dict,
5536
we compare the existing short path with the new one
5637
and save the shorter one in the `targets` dict.
5738
5839
That way, we ensure that the shortest path is used for each target URL
59-
in the list of replacements we output to apply to the `.adoc` files.
40+
in the list of replacements to apply to the `.adoc` files.
6041
6142
6243
## Shortening URLs
@@ -65,31 +46,16 @@
6546
6647
To shorten a target URL, find it in the `targets` dict.
6748
If the target URL is found:
68-
use the existing path.
49+
use the existing short path.
6950
If the target URL is not found:
7051
generate a new short path;
7152
store target and path in both `targets` and `redirects` dicts;
7253
collect new short path and target URL in a `new_redirects` list
73-
to be appended to the `.htaccess` file later.
74-
Targets in memory
75-
76-
To avoid generating a new short URL for a target URL,
77-
78-
79-
80-
the `shortener` module provides a way to generate new short URLs
81-
82-
83-
Procedure:
84-
85-
0. create empty dicts named targets and redirects
86-
1. given a target_url, find it in targets;
87-
1.1. if found, use the short_url stored there
88-
1.2. if not, generate new short_url and store it in targets and redirects
54+
to be appended to the `.htaccess` file at the end of the process.
8955
9056
"""
9157

92-
58+
import itertools
9359
from collections.abc import Iterable, Iterator
9460

9561

@@ -111,6 +77,7 @@ def key(k: str) -> tuple[int, bool, list[str]]:
11177
if len(parts) > 1:
11278
parts = [(f'z{p:>08}' if p.isnumeric() else p) for p in parts]
11379
return len(k), '-' in k, parts
80+
11481
return min(a, b, key=key)
11582

11683

@@ -125,4 +92,23 @@ def load_redirects(pairs: Iterable[tuple[str, str]]) -> tuple[dict, dict]:
12592
else:
12693
targets[url] = choose(short_url, existing_short_url)
12794

128-
return redirects, targets
95+
return redirects, targets
96+
97+
98+
# Alphabet for short codes: digits 2-9 plus lowercase letters without
# i, l, o, u (30 symbols).
# NOTE(review): presumably 0, 1, i, l, o are left out as look-alikes — confirm.
SDIGITS = '23456789abcdefghjkmnpqrstvwxyz'


def gen_short(start_len: int = 1) -> Iterator[str]:
    """Generate every possible sequence of SDIGITS, starting with start_len.

    Codes are yielded in SDIGITS order: first every code of exactly
    `start_len` characters, then every code one character longer, and so
    on without end. Callers must stop consuming on their own.

    Raises ValueError on first iteration if `start_len` is less than 1.
    """
    # repeat=0 would make itertools.product yield one empty tuple, i.e. the
    # empty string as a "short code"; negative values already fail inside
    # product, so reject both the same way.
    if start_len < 1:
        raise ValueError(f'start_len must be >= 1, got {start_len!r}')
    for length in itertools.count(start_len):
        for digits in itertools.product(SDIGITS, repeat=length):
            yield ''.join(digits)
108+
109+
110+
def gen_unused_short(redirects: dict, start_len: int = 2) -> Iterator[str]:
    """Generate next available short URL of len >= start_len.

    Walks the codes produced by `gen_short(start_len)` and yields only
    those that are not keys of `redirects`. Membership is tested when each
    code is reached, so keys added to `redirects` between `next()` calls
    are skipped too. The generator never ends on its own.

    `start_len` defaults to 2, the minimum length that was previously
    hard-coded here.
    """
    for short in gen_short(start_len):
        if short not in redirects:
            yield short

links/test_shortener.py

Lines changed: 45 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,37 @@
11
from pytest import mark
22

33
from shortener import parse_htaccess, choose, load_redirects
4+
from shortener import gen_short, gen_unused_short
45

56

6-
HTACCESS_1 = """
7-
ErrorDocument 404 /404.html
7+
PARSED_SAMPLE_HTACCESS = [
8+
('book', 'https://www.oreilly.com/.../9781492056348/'),
9+
('home', 'https://www.fluentpython.com/'),
10+
('1-20', 'https://www.fluentpython.com/'),
11+
('ora', 'https://www.oreilly.com/.../9781492056348/'),
12+
('2-10', 'http://example.com/'),
13+
('10-2', 'http://example.com/'),
14+
]
815

9-
# main resources
10-
RedirectTemp /book https://www.oreilly.com/.../9781492056348/
11-
RedirectTemp /home https://www.fluentpython.com/ # extra content site
1216

13-
# duplicate targets
14-
RedirectTemp /1-20 https://www.fluentpython.com/
15-
RedirectTemp /ora https://www.oreilly.com/.../9781492056348/
16-
RedirectTemp /2-10 http://example.com/
17-
RedirectTemp /10-2 http://example.com/
18-
"""
17+
def test_parse_htaccess(shared_datadir):
    """Parsing sample.htaccess yields the expected (short path, URL) pairs."""
    sample = shared_datadir / 'sample.htaccess'
    parsed = list(parse_htaccess(sample.read_text()))
    assert parsed == PARSED_SAMPLE_HTACCESS
1922

20-
PARSED_HTACCESS_1 = [
21-
('book', 'https://www.oreilly.com/.../9781492056348/'),
22-
('home', 'https://www.fluentpython.com/'),
23-
('1-20', 'https://www.fluentpython.com/'),
24-
('ora', 'https://www.oreilly.com/.../9781492056348/'),
25-
('2-10', 'http://example.com/'),
26-
('10-2', 'http://example.com/'),
27-
]
2823

29-
def test_parse_htaccess():
30-
res = list(parse_htaccess(HTACCESS_1))
31-
assert res == PARSED_HTACCESS_1
32-
33-
@mark.parametrize('a,b,expected', [
34-
('a', 'b', 'a'),
35-
('b', 'a', 'a'),
36-
('aa', 'a', 'a'),
37-
('a-a', 'aaa', 'aaa'),
38-
('2-10', '10-2', '2-10'),
39-
('p-1', '1-1', 'p-1'),
40-
])
24+
@mark.parametrize(
25+
'a,b,expected',
26+
[
27+
('a', 'b', 'a'),
28+
('b', 'a', 'a'),
29+
('aa', 'a', 'a'),
30+
('a-a', 'aaa', 'aaa'),
31+
('2-10', '10-2', '2-10'),
32+
('p-1', '1-1', 'p-1'),
33+
],
34+
)
4135
def test_choose(a, b, expected):
4236
res = choose(a, b)
4337
assert res == expected
@@ -52,15 +46,29 @@ def test_load_redirects():
5246
'book': 'https://www.oreilly.com/.../9781492056348/',
5347
'ora': 'https://www.oreilly.com/.../9781492056348/',
5448
}
55-
redirects, _ = load_redirects(PARSED_HTACCESS_1)
49+
redirects, _ = load_redirects(PARSED_SAMPLE_HTACCESS)
5650
assert redirects == expected
5751

5852

5953
def test_load_redirect_targets():
6054
expected = {
61-
'https://www.fluentpython.com/' : 'home',
62-
'https://www.oreilly.com/.../9781492056348/' : 'ora',
63-
'http://example.com/' : '2-10',
55+
'https://www.fluentpython.com/': 'home',
56+
'https://www.oreilly.com/.../9781492056348/': 'ora',
57+
'http://example.com/': '2-10',
6458
}
65-
_, targets = load_redirects(PARSED_HTACCESS_1)
66-
assert targets == expected
59+
_, targets = load_redirects(PARSED_SAMPLE_HTACCESS)
60+
assert targets == expected
61+
62+
63+
def test_gen_short():
    """The first ten 3-character codes follow SDIGITS order from '222'."""
    codes = gen_short(3)
    first_ten = []
    for _ in range(10):
        first_ten.append(next(codes))
    assert first_ten == [
        '222', '223', '224', '225', '226',
        '227', '228', '229', '22a', '22b',
    ]
68+
69+
70+
def test_gen_unused_short():
    """Codes already used as redirect keys are skipped, in order."""
    taken = {'22': 'u1', '23': 'u2', '25': 'u4'}
    unused = gen_unused_short(taken)
    first = next(unused)
    assert first == '24'
    second = next(unused)
    assert second == '26'

requirements.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
anyio==4.9.0
2+
certifi==2025.4.26
3+
h11==0.16.0
4+
httpcore==1.0.9
5+
httpx==0.28.1
6+
idna==3.10
7+
iniconfig==2.1.0
8+
packaging==25.0
9+
pluggy==1.6.0
10+
Pygments==2.19.1
11+
pytest==8.4.0
12+
pytest-datadir==1.7.2
13+
ruff==0.11.11
14+
sniffio==1.3.1

0 commit comments

Comments
 (0)