From c226e7c1e57a76e8783989da98995f584a0369db Mon Sep 17 00:00:00 2001 From: Magnus Hagander Date: Thu, 20 Feb 2025 17:38:01 +0100 Subject: [PATCH] Implement "facet shortening" for bluesky posting Turns out bluesky doesn't actually shorten URLs if they are posted through the API even if they have a matching "facet", and the example code they have ignores this. So we have to implement our own that basically shortens the "inside" of a facet to an appropriate length. We'll re-use the twitter-url-shortener-length to make things predictable between providers. --- postgresqleu/util/messaging/bluesky.py | 46 ++++++++++++++++++-------- postgresqleu/util/messaging/short.py | 12 +++---- 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/postgresqleu/util/messaging/bluesky.py b/postgresqleu/util/messaging/bluesky.py index e202758b..f556875c 100644 --- a/postgresqleu/util/messaging/bluesky.py +++ b/postgresqleu/util/messaging/bluesky.py @@ -7,6 +7,7 @@ import requests from postgresqleu.util.image import get_image_contenttype_from_bytes from postgresqleu.util.versionutil import decode_unverified_jwt +from postgresqleu.util.messaging.short import url_shortened_len from postgresqleu.confreg.models import MessagingProvider from postgresqleu.confreg.backendforms import BackendSeriesMessagingForm @@ -147,8 +148,9 @@ class Bluesky(object): "createdAt": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), } - facets = self._parse_facets(post["text"]) + newtext, facets = self._parse_facets(post["text"]) if facets: + post["text"] = newtext post["facets"] = facets if image: @@ -231,20 +233,34 @@ class Bluesky(object): # From Bluesky examples def _parse_urls(self, text: str): - spans = [] # partial/naive URL regex based on: https://stackoverflow.com/a/3809435 # tweaked to disallow some training punctuation - url_regex = rb"[$|\W](https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)" + url_regex 
= re.compile(rb"([$|\W])(https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)") text_bytes = text.encode("UTF-8") - for m in re.finditer(url_regex, text_bytes): - spans.append( - { - "start": m.start(1), - "end": m.end(1), - "url": m.group(1).decode("UTF-8"), - } - ) - return spans + + class _url_replacer: + def __init__(self): + self.spans = [] + self.shortenedby = 0 + + def replace(self, match: bytes): + if len(match.group(2)) <= url_shortened_len: + url = match.group(2) + else: + url = match.group(2)[:url_shortened_len - 3] + b'...' + end = match.start(2) + len(url) + self.spans.append({ + 'start': match.start(2) - self.shortenedby, + 'end': end - self.shortenedby, + 'url': match.group(2), + }) + self.shortenedby += len(match.group(2)) - len(url) + return match.group(1) + url + + replacer = _url_replacer() + r, n = url_regex.subn(replacer.replace, text_bytes) + + return r.decode('UTF-8'), replacer.spans def _parse_mentions(self, text: str): spans = [] @@ -287,7 +303,9 @@ class Bluesky(object): "features": [{"$type": "app.bsky.richtext.facet#mention", "did": did}], } ) - for u in self._parse_urls(text): + newtext, urls = self._parse_urls(text) + + for u in urls: facets.append( { "index": { @@ -303,4 +321,4 @@ class Bluesky(object): ], } ) - return facets + return newtext, facets diff --git a/postgresqleu/util/messaging/short.py b/postgresqleu/util/messaging/short.py index 7b54528b..966a7de0 100644 --- a/postgresqleu/util/messaging/short.py +++ b/postgresqleu/util/messaging/short.py @@ -10,8 +10,8 @@ _re_urlmatcher = re.compile(r'\bhttps?://\S+', re.I) # This is currently the value for Twitter and the default for Mastodon, so just # use that globally for now. 
-_url_shortened_len = 23 -_url_counts_as_characters = "https://short.url/{}".format((_url_shortened_len - len("https://short.url/")) * 'x') +url_shortened_len = 23 +_url_counts_as_characters = "https://short.url/{}".format((url_shortened_len - len("https://short.url/")) * 'x') def get_shortened_post_length(txt): @@ -28,12 +28,12 @@ def truncate_shortened_post(txt, maxlen): return txt[:maxlen] firststart, firstend = matches[0].span() - if firststart + _url_shortened_len > maxlen: + if firststart + url_shortened_len > maxlen: # We hit the size limit before the url or in the middle of it, so skip the whole url return txt[:firststart] inlen = firstend - outlen = firststart + _url_shortened_len + outlen = firststart + url_shortened_len for i, curr in enumerate(matches[1:]): prevstart, prevend = matches[i].span() currstart, currend = curr.span() @@ -43,13 +43,13 @@ def truncate_shortened_post(txt, maxlen): # The limit was hit in the text between urls left = maxlen - outlen return txt[:inlen + (maxlen - outlen)] - if outlen + betweenlen + _url_shortened_len > maxlen: + if outlen + betweenlen + url_shortened_len > maxlen: # The limit was hit in the middle of this URL, so include all the text # up to it, but skip the url. return txt[:inlen + betweenlen] # The whole URL fit inlen += betweenlen + currend - currstart - outlen += betweenlen + _url_shortened_len + outlen += betweenlen + url_shortened_len return txt[:inlen + (maxlen - outlen)] -- 2.39.5