shortener WIP

ramalho · ramalho · commit b14fbe004415 · 2025-06-07T12:59:19.000-03:00
diff --git a/links/data/README.txt b/links/data/README.txt
@@ -0,0 +1 @@
+data files for testing, used by pytest-datadir
diff --git a/links/data/sample.htaccess b/links/data/sample.htaccess
@@ -0,0 +1,11 @@
+ErrorDocument 404 /404.html
+
+# main resources
+RedirectTemp /book	https://www.oreilly.com/.../9781492056348/
+RedirectTemp /home	https://www.fluentpython.com/  # extra content site
+
+# duplicate targets
+RedirectTemp /1-20	https://www.fluentpython.com/
+RedirectTemp /ora	https://www.oreilly.com/.../9781492056348/
+RedirectTemp /2-10	http://example.com/
+RedirectTemp /10-2	http://example.com/
diff --git a/links/prep1_shorten_urls.py b/links/prep1_shorten_urls.py
@@ -3,20 +3,20 @@
 import fileinput
 import re
 
-URL_RE = re.compile(r'''https?://[^\s[<>"']+''')
+URL_RE = re.compile(r"""https?://[^\s[<>"']+""")
+
 
 def find_urls(fpy=True, long=True):
     found = 0
     for line in (l.rstrip() for l in fileinput.input()):
         if match := URL_RE.search(line):
             url = match.group()
             is_fpy = '://fpy.li/' in url
-            if ((is_fpy and not fpy) or 
-                (not is_fpy and not long)):
+            if (is_fpy and not fpy) or (not is_fpy and not long):
                 continue
             print(url)
             found += 1
-    # print('FOUND', found, 'URLs')     
+    # print('FOUND', found, 'URLs')
 
 
 if __name__ == '__main__':
diff --git a/links/shorten.py b/links/shorten.py
@@ -28,13 +28,15 @@
 type RedirMap = dict[ShortCode, Url]
 type TargetMap = dict[Url, ShortCode]
 
+
 class ShortPair(NamedTuple):
     code: ShortCode
     url: Url
 
+
 def load_redirects() -> tuple[RedirMap, TargetMap]:
-    redirects:RedirMap = {}
-    targets:TargetMap = {}
+    redirects: RedirMap = {}
+    targets: TargetMap = {}
     for filename in HTACCESS_FILES:
         with open(filename) as fp:
             for line in fp:
@@ -43,8 +45,8 @@ def load_redirects() -> tuple[RedirMap, TargetMap]:
                     short = field1.encode('ascii')[1:]  # Remove leading slash
                     assert short not in redirects, f'{filename}: duplicate redirect from {short}'
                     # htaccess.custom is live since 2022, I can't change it to remove duplicate targets
-                    #if filename != HTACCESS_MAIN:
-                    #assert long not in targets, f'{filename}: duplicate redirect to {long}'
+                    # if filename != HTACCESS_MAIN:
+                    # assert long not in targets, f'{filename}: duplicate redirect to {long}'
                     if long in targets:
                         print(f'{filename}: duplicate redirect to {long}')
                     redirects[short] = long
@@ -56,7 +58,7 @@ def load_redirects() -> tuple[RedirMap, TargetMap]:
 SDIGITS = b'23456789abcdefghjkmnpqrstvwxyz'
 
 
-def gen_short(start_len=1) -> Iterator[ShortCode]:
+def gen_short(start_len=2) -> Iterator[ShortCode]:
     """Generate every possible sequence of SDIGITS, starting with start_len"""
     length = start_len
     while True:
@@ -65,9 +67,9 @@ def gen_short(start_len=1) -> Iterator[ShortCode]:
         length += 1
 
 
-def gen_unused_short(redirects: dict) -> Iterator[ShortCode]:
-    """Generate next available short URL of len >= 2."""
-    for short in gen_short(2):
+def gen_unused_short(redirects: dict, start_len=2) -> Iterator[ShortCode]:
+    """Generate next available short URL of len >= start_len."""
+    for short in gen_short(start_len):
         if short not in redirects:
             yield short
 
@@ -90,7 +92,7 @@ def shorten(urls: list[str]) -> list[ShortPair]:
                 if timestamp:
                     fp.write(f'\n# appended: {timestamp}\n')
                     timestamp = None
-                fp.write(f'RedirectTemp /{short.decode('ascii')} {long}\n')
+                fp.write(f'RedirectTemp /{short.decode("ascii")} {long}\n')
             pairs.append((short, long))
 
     return pairs
@@ -105,5 +107,5 @@ def main() -> None:
 
 
 if __name__ == '__main__':
-    #main()
+    # main()
     load_redirects()
diff --git a/links/shortener.py b/links/shortener.py
@@ -4,37 +4,15 @@
 This script reads a `.htaccess` file and a plain text file with
 URLs (the target URLs).
 
-It outputs a list of target URLs and their corresponding short URLs,
-made from paths in the FPI.LI domain like `/2d`, `/2e`, etc.
+It outputs a list of target URLs and corresponding paths
+for short URLs in the FPY.LI domain like `/2d`, `/2e`, etc.
 This list is used to replace the target URLs with short URLs
 in the `.adoc` files where the target URLs are used.
 
 If a target URL is not in the `.htaccess` file,
 the script generates a new short URL
-and appends a new `RedirectTemp` directive to the `.htaccess` file.
-
-
-## `.httaccess` file
-
-A file named `.htaccess` in this format is deployed to the web server
-at FPY.LI to redirect short URLs to target URLs (the longer ones).
-
-```
-# added: 2025-05-26 16:01:24
-RedirectTemp /2d https://mitpress.mit.edu/9780262111584/the-art-of-the-metaobject-protocol/
-RedirectTemp /2e https://dabeaz.com/per.html
-RedirectTemp /2f https://pythonfluente.com/2/#iter_closer_look
-
-```
-
-When a user agent requests a URL like `https://fpy.li/2d`,
-the web server responds with a 302 redirect to the longer URL
-`https://mitpress.mit.edu/9780262111584/the-art-of-the-metaobject-protocol/`.
-
-A temporary redirect (code 302)
-tells user agents to come back to the same URL at FPY.LI later,
-and not update their bookmark.
-This allows me update the target URL, if needed.
+and adds a new `RedirectTemp` directive to the `.htaccess` file,
+appending it in place with a timestamp.
 
 ## Redirects in memory
 
@@ -48,15 +26,18 @@
 but the algorithm is more complicated.
 
 The same target URL can be mapped to multiple short paths
-due to past mistakes when updating the `.htaccess` file.
+in `.htaccess` when the same target URL was added more
+than once with different short paths by mistake.
+We cannot fix these mistakes because the redundant
+short paths are printed in Fluent Python Second Edition.
 
 When loading the `.htaccess` file,
 if a target URL is already in the `targets` dict,
 we compare the existing short path with the new one
 and save the shorter one in the `targets` dict.
 
 That way, we ensure that the shortest path is used for each target URL
-in the list of replacements we output to apply to the `.adoc` files.
+in the list of replacements to apply to the `.adoc` files.
 
 
 ## Shortening URLs
@@ -65,31 +46,16 @@
 
 To shorten a target URL, find it in the `targets` dict.
 If the target URL is found:
-    use the existing path.
+    use the existing short path.
 If the target URL is not found:
     generate a new short path;
     store target and path in both `targets` and `redirects` dicts;
     collect new short path and target URL in a `new_redirects` list
-    to be appended to the `.htaccess` file later.
-Targets in memory
-
-To avoid generating a new short URL for a target URL,
-
-
-
-the `shortener` module provides a way to generate new short URLs
-
-
-Procedure:
-
-0. create empty dicts named targets and redirects
-1. given a target_url, find it in targets;
-    1.1. if found, use the short_url stored there
-    1.2. if not, generate new short_url and store it in targets and redirects
+    to be appended to the `.htaccess` file at the end of the process.
 
 """
 
-
+import itertools
 from collections.abc import Iterable, Iterator
 
 
@@ -111,6 +77,7 @@ def key(k: str) -> tuple[int, bool, list[str]]:
         if len(parts) > 1:
             parts = [(f'z{p:>08}' if p.isnumeric() else p) for p in parts]
         return len(k), '-' in k, parts
+
     return min(a, b, key=key)
 
 
@@ -125,4 +92,23 @@ def load_redirects(pairs: Iterable[tuple[str, str]]) -> tuple[dict, dict]:
         else:
             targets[url] = choose(short_url, existing_short_url)
 
-    return redirects, targets
+    return redirects, targets
+
+
+SDIGITS = '23456789abcdefghjkmnpqrstvwxyz'
+
+
+def gen_short(start_len=1) -> Iterator[str]:
+    """Generate every possible sequence of SDIGITS, starting with start_len"""
+    length = start_len
+    while True:
+        for digits in itertools.product(SDIGITS, repeat=length):
+            yield ''.join(digits)
+        length += 1
+
+
+def gen_unused_short(redirects: dict) -> Iterator[str]:
+    """Generate next available short URL of len >= 2."""
+    for short in gen_short(2):
+        if short not in redirects:
+            yield short
diff --git a/links/test_shortener.py b/links/test_shortener.py
@@ -1,43 +1,37 @@
 from pytest import mark
 
 from shortener import parse_htaccess, choose, load_redirects
+from shortener import gen_short, gen_unused_short
 
 
-HTACCESS_1 = """
-ErrorDocument 404 /404.html
+PARSED_SAMPLE_HTACCESS = [
+    ('book', 'https://www.oreilly.com/.../9781492056348/'),
+    ('home', 'https://www.fluentpython.com/'),
+    ('1-20', 'https://www.fluentpython.com/'),
+    ('ora', 'https://www.oreilly.com/.../9781492056348/'),
+    ('2-10', 'http://example.com/'),
+    ('10-2', 'http://example.com/'),
+]
 
-# main resources
-RedirectTemp /book	https://www.oreilly.com/.../9781492056348/
-RedirectTemp /home	https://www.fluentpython.com/  # extra content site
 
-# duplicate targets
-RedirectTemp /1-20	https://www.fluentpython.com/
-RedirectTemp /ora	https://www.oreilly.com/.../9781492056348/
-RedirectTemp /2-10	http://example.com/
-RedirectTemp /10-2	http://example.com/
-"""
+def test_parse_htaccess(shared_datadir):
+    with open(shared_datadir / 'sample.htaccess') as fp:
+        text = fp.read()
+    res = list(parse_htaccess(text))
+    assert res == PARSED_SAMPLE_HTACCESS
 
-PARSED_HTACCESS_1 = [
-        ('book', 'https://www.oreilly.com/.../9781492056348/'),
-        ('home', 'https://www.fluentpython.com/'),
-        ('1-20', 'https://www.fluentpython.com/'),
-        ('ora', 'https://www.oreilly.com/.../9781492056348/'),
-        ('2-10', 'http://example.com/'),
-        ('10-2', 'http://example.com/'),
-    ]
 
-def test_parse_htaccess():
-    res = list(parse_htaccess(HTACCESS_1))
-    assert res == PARSED_HTACCESS_1
-
-@mark.parametrize('a,b,expected', [
-    ('a', 'b', 'a'),
-    ('b', 'a', 'a'),
-    ('aa', 'a', 'a'),
-    ('a-a', 'aaa', 'aaa'),
-    ('2-10', '10-2', '2-10'),
-    ('p-1', '1-1', 'p-1'),
-])
+@mark.parametrize(
+    'a,b,expected',
+    [
+        ('a', 'b', 'a'),
+        ('b', 'a', 'a'),
+        ('aa', 'a', 'a'),
+        ('a-a', 'aaa', 'aaa'),
+        ('2-10', '10-2', '2-10'),
+        ('p-1', '1-1', 'p-1'),
+    ],
+)
 def test_choose(a, b, expected):
     res = choose(a, b)
     assert res == expected
@@ -52,15 +46,29 @@ def test_load_redirects():
         'book': 'https://www.oreilly.com/.../9781492056348/',
         'ora': 'https://www.oreilly.com/.../9781492056348/',
     }
-    redirects, _ = load_redirects(PARSED_HTACCESS_1)
+    redirects, _ = load_redirects(PARSED_SAMPLE_HTACCESS)
     assert redirects == expected
 
 
 def test_load_redirect_targets():
     expected = {
-        'https://www.fluentpython.com/' : 'home',
-        'https://www.oreilly.com/.../9781492056348/' : 'ora',
-        'http://example.com/' : '2-10',
+        'https://www.fluentpython.com/': 'home',
+        'https://www.oreilly.com/.../9781492056348/': 'ora',
+        'http://example.com/': '2-10',
     }
-    _, targets = load_redirects(PARSED_HTACCESS_1)
-    assert targets == expected
+    _, targets = load_redirects(PARSED_SAMPLE_HTACCESS)
+    assert targets == expected
+
+
+def test_gen_short():
+    expected = '222 223 224 225 226 227 228 229 22a 22b'.split()
+    gen = gen_short(3)
+    res = [next(gen) for _ in range(10)]
+    assert res == expected
+
+
+def test_gen_unused_short():
+    redirects = {'22': 'u1', '23': 'u2', '25': 'u4'}
+    gen = gen_unused_short(redirects)
+    assert next(gen) == '24'
+    assert next(gen) == '26'
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,14 @@
+anyio==4.9.0
+certifi==2025.4.26
+h11==0.16.0
+httpcore==1.0.9
+httpx==0.28.1
+idna==3.10
+iniconfig==2.1.0
+packaging==25.0
+pluggy==1.6.0
+Pygments==2.19.1
+pytest==8.4.0
+pytest-datadir==1.7.2
+ruff==0.11.11
+sniffio==1.3.1

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+data files for testing, used by pytest-datadir`