From c226e7c1e57a76e8783989da98995f584a0369db Mon Sep 17 00:00:00 2001
From: Magnus Hagander <magnus@hagander.net>
Date: Thu, 20 Feb 2025 17:38:01 +0100
Subject: [PATCH] Implement "facet shortening" for bluesky posting

Turns out bluesky doesn't actually shorten URLs if they are posted
through the API even if they have a matching "facet", and the example
code they have ignores this. So we have to implement our own shortening
that basically truncates the "inside" (the displayed text) of a facet to
an appropriate length, while the facet itself keeps the full URL.
We'll re-use the twitter-url-shortener-length to make things predictable
between providers.
---
 postgresqleu/util/messaging/bluesky.py | 46 ++++++++++++++++++--------
 postgresqleu/util/messaging/short.py   | 12 +++----
 2 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/postgresqleu/util/messaging/bluesky.py b/postgresqleu/util/messaging/bluesky.py
index e202758b..f556875c 100644
--- a/postgresqleu/util/messaging/bluesky.py
+++ b/postgresqleu/util/messaging/bluesky.py
@@ -7,6 +7,7 @@ import requests
 
 from postgresqleu.util.image import get_image_contenttype_from_bytes
 from postgresqleu.util.versionutil import decode_unverified_jwt
+from postgresqleu.util.messaging.short import url_shortened_len
 
 from postgresqleu.confreg.models import MessagingProvider
 from postgresqleu.confreg.backendforms import BackendSeriesMessagingForm
@@ -147,8 +148,9 @@ class Bluesky(object):
             "createdAt": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
         }
 
-        facets = self._parse_facets(post["text"])
+        newtext, facets = self._parse_facets(post["text"])
         if facets:
+            post["text"] = newtext
             post["facets"] = facets
 
         if image:
@@ -231,20 +233,34 @@ class Bluesky(object):
 
     # From Bluesky examples
     def _parse_urls(self, text: str):
-        spans = []
         # partial/naive URL regex based on: https://stackoverflow.com/a/3809435
         # tweaked to disallow some training punctuation
-        url_regex = rb"[$|\W](https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)"
+        url_regex = re.compile(rb"([$|\W])(https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)")
         text_bytes = text.encode("UTF-8")
-        for m in re.finditer(url_regex, text_bytes):
-            spans.append(
-                {
-                    "start": m.start(1),
-                    "end": m.end(1),
-                    "url": m.group(1).decode("UTF-8"),
-                }
-            )
-        return spans
+
+        class _url_replacer:
+            def __init__(self):
+                self.spans = []
+                self.shortenedby = 0
+
+            def replace(self, match: bytes):
+                if len(match.group(2)) <= url_shortened_len:
+                    url = match.group(2)
+                else:
+                    url = match.group(2)[:url_shortened_len - 3] + b'...'
+                end = match.start(2) + len(url)
+                self.spans.append({
+                    'start': match.start(2) - self.shortenedby,
+                    'end': end - self.shortenedby,
+                    'url': match.group(2),
+                })
+                self.shortenedby += len(match.group(2)) - len(url)
+                return match.group(1) + url
+
+        replacer = _url_replacer()
+        r, n = url_regex.subn(replacer.replace, text_bytes)
+
+        return r.decode('UTF-8'), replacer.spans
 
     def _parse_mentions(self, text: str):
         spans = []
@@ -287,7 +303,9 @@ class Bluesky(object):
                     "features": [{"$type": "app.bsky.richtext.facet#mention", "did": did}],
                 }
             )
-        for u in self._parse_urls(text):
+        newtext, urls = self._parse_urls(text)
+
+        for u in urls:
             facets.append(
                 {
                     "index": {
@@ -303,4 +321,4 @@ class Bluesky(object):
                     ],
                 }
             )
-        return facets
+        return newtext, facets
diff --git a/postgresqleu/util/messaging/short.py b/postgresqleu/util/messaging/short.py
index 7b54528b..966a7de0 100644
--- a/postgresqleu/util/messaging/short.py
+++ b/postgresqleu/util/messaging/short.py
@@ -10,8 +10,8 @@ _re_urlmatcher = re.compile(r'\bhttps?://\S+', re.I)
 
 # This is currently the value for Twitter and the default for Mastodon, so just
 # use that globally for now.
-_url_shortened_len = 23
-_url_counts_as_characters = "https://short.url/{}".format((_url_shortened_len - len("https://short.url/")) * 'x')
+url_shortened_len = 23
+_url_counts_as_characters = "https://short.url/{}".format((url_shortened_len - len("https://short.url/")) * 'x')
 
 
 def get_shortened_post_length(txt):
@@ -28,12 +28,12 @@ def truncate_shortened_post(txt, maxlen):
         return txt[:maxlen]
 
     firststart, firstend = matches[0].span()
-    if firststart + _url_shortened_len > maxlen:
+    if firststart + url_shortened_len > maxlen:
         # We hit the size limit before the url or in the middle of it, so skip the whole url
         return txt[:firststart]
 
     inlen = firstend
-    outlen = firststart + _url_shortened_len
+    outlen = firststart + url_shortened_len
     for i, curr in enumerate(matches[1:]):
         prevstart, prevend = matches[i].span()
         currstart, currend = curr.span()
@@ -43,13 +43,13 @@ def truncate_shortened_post(txt, maxlen):
             # The limit was hit in the text between urls
             left = maxlen - outlen
             return txt[:inlen + (maxlen - outlen)]
-        if outlen + betweenlen + _url_shortened_len > maxlen:
+        if outlen + betweenlen + url_shortened_len > maxlen:
             # The limit was hit in the middle of this URL, so include all the text
             # up to it, but skip the url.
             return txt[:inlen + betweenlen]
 
         # The whole URL fit
         inlen += betweenlen + currend - currstart
-        outlen += betweenlen + _url_shortened_len
+        outlen += betweenlen + url_shortened_len
 
     return txt[:inlen + (maxlen - outlen)]
-- 
2.39.5