From 24b3fdb4f34f85be090c335426e41403331e3ddf Mon Sep 17 00:00:00 2001 From: Duncan Ogilvie Date: Sun, 26 Apr 2026 14:08:42 +0200 Subject: [PATCH 1/6] Add support for pull request reviews Closes #124 --- CHANGES.rst | 2 + README.rst | 16 ++- github_backup/github_backup.py | 148 ++++++++++++++++++-- tests/test_pull_reviews.py | 237 +++++++++++++++++++++++++++++++++ 4 files changed, 388 insertions(+), 15 deletions(-) create mode 100644 tests/test_pull_reviews.py diff --git a/CHANGES.rst b/CHANGES.rst index 50f8d54..b790ce1 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -5,6 +5,8 @@ Unreleased ---------- - Add GitHub Discussions backups via GraphQL, including comments, replies, optional attachment downloads, and per-repository incremental checkpoints. +- Add pull request review backups with ``--pull-reviews`` and one-time + incremental backfill for existing backups. - Add ``--token-from-gh`` to read authentication from ``gh auth token``. diff --git a/README.rst b/README.rst index 4135743..52d7222 100644 --- a/README.rst +++ b/README.rst @@ -42,7 +42,8 @@ CLI Help output:: [--starred] [--all-starred] [--starred-skip-size-over MB] [--watched] [--followers] [--following] [--all] [--issues] [--issue-comments] [--issue-events] [--pulls] - [--pull-comments] [--pull-commits] [--pull-details] + [--pull-comments] [--pull-reviews] [--pull-commits] + [--pull-details] [--labels] [--hooks] [--milestones] [--security-advisories] [--discussions] [--repositories] [--bare] [--no-prune] [--lfs] [--wikis] [--gists] [--starred-gists] @@ -97,6 +98,7 @@ CLI Help output:: --issue-events include issue events in backup --pulls include pull requests in backup --pull-comments include pull request review comments in backup + --pull-reviews include pull request reviews in backup --pull-commits include pull request commits in backup --pull-details include more pull request details in backup [*] --labels include labels in backup @@ -340,6 +342,14 @@ For finer control, avoid using ``--assets`` with starred repos, or use ``--skip- Alternatively, consider just storing links to starred repos in JSON format with ``--starred``. +About pull request reviews +-------------------------- + +Use ``--pull-reviews`` with ``--pulls`` to include GitHub pull request review metadata under each pull request's ``review_data`` key. Reviews are separate from review comments: ``--pull-comments`` backs up inline review comments via ``comment_data`` and regular PR conversation comments via ``comment_regular_data``, while ``--pull-reviews`` backs up review state, submitted time, commit ID, and the top-level review body. + +``--pull-reviews`` is included in ``--all``. Incremental backups use a per-repository checkpoint at ``repositories/{repo}/pulls/reviews_last_update``. If ``--pull-reviews`` is enabled on an existing incremental backup, the first run performs a one-time backfill for pull request reviews so older PRs are not skipped by the existing repository checkpoint. Existing ``comment_data``, ``comment_regular_data`` and ``commit_data`` fields are preserved when only review data is being added. + + Incremental Backup ------------------ @@ -431,14 +441,14 @@ Quietly and incrementally backup useful Github user data (public and private rep export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN GH_USER=YOUR-GITHUB-USER - github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --security-advisories --discussions --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER + github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-reviews --pull-commits --labels --milestones --security-advisories --discussions --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER Debug an error/block or incomplete backup into a temporary directory. Omit "incremental" to fill a previous incomplete backup. :: export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN GH_USER=YOUR-GITHUB-USER - github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --discussions --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER + github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-reviews --pull-commits --labels --milestones --discussions --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER Pipe a token from stdin to avoid storing it in environment variables or command history (Unix-like systems only):: diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index c1245bd..054d0c6 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -293,6 +293,12 @@ def parse_args(args=None): dest="include_pull_comments", help="include pull request review comments in backup", ) + parser.add_argument( + "--pull-reviews", + action="store_true", + dest="include_pull_reviews", + help="include pull request reviews in backup", + ) parser.add_argument( "--pull-commits", action="store_true", @@ -2427,6 +2433,57 @@ def backup_issues(args, repo_cwd, repository, repos_template): os.replace(issue_file + ".temp", issue_file) # Atomic write +PULL_OPTIONAL_DATA_KEYS = ( + "comment_regular_data", + "comment_data", + "commit_data", + "review_data", +) +PULL_REVIEWS_LAST_UPDATE_FILENAME = "reviews_last_update" + + +def read_json_file_if_exists(path): + if not os.path.isfile(path): + return None + + try: + with codecs.open(path, "r", encoding="utf-8") as f: + return json.load(f) + except (OSError, UnicodeDecodeError, json.decoder.JSONDecodeError) as e: + logger.debug("Error reading existing JSON file {0}: {1}".format(path, e)) + return None + + +def restore_existing_pull_optional_data(pull, existing_pull): + if not existing_pull: + return + + for key in PULL_OPTIONAL_DATA_KEYS: + if key not in pull and key in existing_pull: + pull[key] = existing_pull[key] + + +def get_pull_reviews_since(args, pulls_cwd): + args_since = getattr(args, "since", None) + if not args.incremental: + return args_since, None, None + + reviews_last_update_path = os.path.join( + pulls_cwd, PULL_REVIEWS_LAST_UPDATE_FILENAME + ) + if not os.path.exists(reviews_last_update_path): + # One-time backfill for existing incremental backups: if the user adds + # --pull-reviews after a repository checkpoint already exists, the + # repository-level checkpoint would otherwise skip old PRs forever. + return None, None, reviews_last_update_path + + reviews_since = open(reviews_last_update_path).read().strip() + if args_since and reviews_since: + return min(args_since, reviews_since), reviews_since, reviews_last_update_path + + return args_since or reviews_since, reviews_since, reviews_last_update_path + + def backup_pulls(args, repo_cwd, repository, repos_template): has_pulls_dir = os.path.isdir("{0}/pulls/.git".format(repo_cwd)) if args.skip_existing and has_pulls_dir: @@ -2436,7 +2493,20 @@ def backup_pulls(args, repo_cwd, repository, repos_template): pulls_cwd = os.path.join(repo_cwd, "pulls") mkdir_p(repo_cwd, pulls_cwd) + include_pull_reviews = args.include_pull_reviews or args.include_everything + repository_since = getattr(args, "since", None) + pulls_since = repository_since + pull_reviews_since = None + pull_reviews_last_update_path = None + if include_pull_reviews: + ( + pulls_since, + pull_reviews_since, + pull_reviews_last_update_path, + ) = get_pull_reviews_since(args, pulls_cwd) + pulls = {} + newest_pull_update = None _pulls_template = "{0}/{1}/pulls".format(repos_template, repository["full_name"]) _issue_template = "{0}/{1}/issues".format(repos_template, repository["full_name"]) query_args = { @@ -2446,27 +2516,43 @@ def backup_pulls(args, repo_cwd, repository, repos_template): "direction": "desc", } + def track_newest_pull_update(pull): + nonlocal newest_pull_update + updated_at = pull.get("updated_at") + if updated_at and ( + newest_pull_update is None or updated_at > newest_pull_update + ): + newest_pull_update = updated_at + + def pull_is_due_for_repository_checkpoint(pull): + return not repository_since or pull["updated_at"] >= repository_since + if not args.include_pull_details: pull_states = ["open", "closed"] for pull_state in pull_states: query_args["state"] = pull_state _pulls = retrieve_data(args, _pulls_template, query_args=query_args) for pull in _pulls: - if args.since and pull["updated_at"] < args.since: + track_newest_pull_update(pull) + if pulls_since and pull["updated_at"] < pulls_since: break - if not args.since or pull["updated_at"] >= args.since: + if not pulls_since or pull["updated_at"] >= pulls_since: pulls[pull["number"]] = pull else: _pulls = retrieve_data(args, _pulls_template, query_args=query_args) for pull in _pulls: - if args.since and pull["updated_at"] < args.since: + track_newest_pull_update(pull) + if pulls_since and pull["updated_at"] < pulls_since: break - if not args.since or pull["updated_at"] >= args.since: - pulls[pull["number"]] = retrieve_data( - args, - _pulls_template + "/{}".format(pull["number"]), - paginated=False, - )[0] + if not pulls_since or pull["updated_at"] >= pulls_since: + if pull_is_due_for_repository_checkpoint(pull): + pulls[pull["number"]] = retrieve_data( + args, + _pulls_template + "/{}".format(pull["number"]), + paginated=False, + )[0] + else: + pulls[pull["number"]] = pull logger.info("Saving {0} pull requests to disk".format(len(list(pulls.keys())))) # Comments from pulls API are only _review_ comments @@ -2476,24 +2562,50 @@ def backup_pulls(args, repo_cwd, repository, repos_template): comments_regular_template = _issue_template + "/{0}/comments" comments_template = _pulls_template + "/{0}/comments" commits_template = _pulls_template + "/{0}/commits" + reviews_template = _pulls_template + "/{0}/reviews" + pull_review_errors = False + for number, pull in list(pulls.items()): pull_file = "{0}/{1}.json".format(pulls_cwd, number) + existing_pull = read_json_file_if_exists(pull_file) + needs_review_backfill = ( + include_pull_reviews + and (not existing_pull or "review_data" not in existing_pull) + ) + if args.incremental_by_files and os.path.isfile(pull_file): modified = os.path.getmtime(pull_file) modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ") - if modified > pull["updated_at"]: + if modified > pull["updated_at"] and not needs_review_backfill: logger.info( "Skipping pull request {0} because it wasn't modified since last backup".format( number ) ) continue - if args.include_pull_comments or args.include_everything: + + should_fetch_non_review_data = pull_is_due_for_repository_checkpoint(pull) + if ( + args.include_pull_comments or args.include_everything + ) and should_fetch_non_review_data: template = comments_regular_template.format(number) pulls[number]["comment_regular_data"] = retrieve_data(args, template) template = comments_template.format(number) pulls[number]["comment_data"] = retrieve_data(args, template) - if args.include_pull_commits or args.include_everything: + if include_pull_reviews: + template = reviews_template.format(number) + try: + pulls[number]["review_data"] = retrieve_data(args, template) + except Exception as e: + pull_review_errors = True + logger.warning( + "Unable to retrieve reviews for pull request {0}#{1}, skipping reviews: {2}".format( + repository["full_name"], number, e + ) + ) + if ( + args.include_pull_commits or args.include_everything + ) and should_fetch_non_review_data: template = commits_template.format(number) pulls[number]["commit_data"] = retrieve_data(args, template) if args.include_attachments: @@ -2501,10 +2613,22 @@ def backup_pulls(args, repo_cwd, repository, repos_template): args, pulls_cwd, pulls[number], number, repository, item_type="pull" ) + restore_existing_pull_optional_data(pull, existing_pull) + with codecs.open(pull_file + ".temp", "w", encoding="utf-8") as f: json_dump(pull, f) os.replace(pull_file + ".temp", pull_file) # Atomic write + if ( + include_pull_reviews + and args.incremental + and pull_reviews_last_update_path + and newest_pull_update + and not pull_review_errors + and (not pull_reviews_since or newest_pull_update > pull_reviews_since) + ): + open(pull_reviews_last_update_path, "w").write(newest_pull_update) + def backup_milestones(args, repo_cwd, repository, repos_template): milestone_cwd = os.path.join(repo_cwd, "milestones") diff --git a/tests/test_pull_reviews.py b/tests/test_pull_reviews.py new file mode 100644 index 0000000..6130269 --- /dev/null +++ b/tests/test_pull_reviews.py @@ -0,0 +1,237 @@ +"""Tests for pull request review backups.""" + +import json +import os + +from github_backup import github_backup + + +def test_parse_args_pull_reviews_flag(): + args = github_backup.parse_args(["--pull-reviews", "testuser"]) + assert args.include_pull_reviews is True + + +def test_backup_pulls_includes_review_data(create_args, tmp_path, monkeypatch): + args = create_args(include_pulls=True, include_pull_reviews=True) + repository = {"full_name": "owner/repo"} + calls = [] + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + calls.append((template, query_args)) + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + "updated_at": "2026-02-01T00:00:00Z", + "title": "Add feature", + } + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + return [ + { + "id": 123, + "state": "APPROVED", + "body": "Looks good", + "submitted_at": "2026-02-01T00:00:00Z", + } + ] + raise AssertionError("Unexpected template: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + with open(tmp_path / "pulls" / "1.json", encoding="utf-8") as f: + pull = json.load(f) + + assert pull["review_data"] == [ + { + "body": "Looks good", + "id": 123, + "state": "APPROVED", + "submitted_at": "2026-02-01T00:00:00Z", + } + ] + assert ( + "https://api.github.com/repos/owner/repo/pulls/1/reviews", + None, + ) in calls + + +def test_pull_reviews_backfill_ignores_repository_checkpoint( + create_args, tmp_path, monkeypatch +): + args = create_args( + include_pulls=True, + include_pull_reviews=True, + incremental=True, + ) + args.since = "2026-01-01T00:00:00Z" + repository = {"full_name": "owner/repo"} + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + "updated_at": "2025-01-01T00:00:00Z", + "title": "Old pull request", + } + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + return [{"id": 123, "state": "APPROVED"}] + raise AssertionError("Unexpected template: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + with open(tmp_path / "pulls" / "1.json", encoding="utf-8") as f: + pull = json.load(f) + + assert pull["review_data"] == [{"id": 123, "state": "APPROVED"}] + assert (tmp_path / "pulls" / "reviews_last_update").read_text() == ( + "2025-01-01T00:00:00Z" + ) + + +def test_pull_reviews_uses_review_checkpoint_when_older_than_repository_checkpoint( + create_args, tmp_path, monkeypatch +): + args = create_args( + include_pulls=True, + include_pull_reviews=True, + incremental=True, + ) + args.since = "2026-01-01T00:00:00Z" + repository = {"full_name": "owner/repo"} + pulls_dir = tmp_path / "pulls" + pulls_dir.mkdir() + (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z") + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + "updated_at": "2025-06-01T00:00:00Z", + "title": "Review changed while feature was disabled", + }, + { + "number": 2, + "updated_at": "2024-12-01T00:00:00Z", + "title": "Too old", + }, + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + return [{"id": 123, "state": "COMMENTED"}] + raise AssertionError("Unexpected template: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + assert os.path.exists(tmp_path / "pulls" / "1.json") + assert not os.path.exists(tmp_path / "pulls" / "2.json") + assert (tmp_path / "pulls" / "reviews_last_update").read_text() == ( + "2025-06-01T00:00:00Z" + ) + + +def test_pull_reviews_preserves_existing_optional_pull_data( + create_args, tmp_path, monkeypatch +): + args = create_args(include_pulls=True, include_pull_reviews=True) + repository = {"full_name": "owner/repo"} + pulls_dir = tmp_path / "pulls" + pulls_dir.mkdir() + with open(pulls_dir / "1.json", "w", encoding="utf-8") as f: + json.dump( + { + "number": 1, + "updated_at": "2026-01-01T00:00:00Z", + "comment_data": [{"id": 10, "body": "inline comment"}], + "comment_regular_data": [{"id": 11, "body": "regular comment"}], + "commit_data": [{"sha": "abc"}], + }, + f, + ) + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + "updated_at": "2026-02-01T00:00:00Z", + "title": "Add reviews", + } + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + return [{"id": 123, "state": "APPROVED"}] + raise AssertionError("Unexpected template: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + with open(pulls_dir / "1.json", encoding="utf-8") as f: + pull = json.load(f) + + assert pull["review_data"] == [{"id": 123, "state": "APPROVED"}] + assert pull["comment_data"] == [{"id": 10, "body": "inline comment"}] + assert pull["comment_regular_data"] == [{"id": 11, "body": "regular comment"}] + assert pull["commit_data"] == [{"sha": "abc"}] + + +def test_pull_reviews_does_not_advance_checkpoint_on_review_error( + create_args, tmp_path, monkeypatch +): + args = create_args( + include_pulls=True, + include_pull_reviews=True, + incremental=True, + ) + args.since = "2026-01-01T00:00:00Z" + repository = {"full_name": "owner/repo"} + pulls_dir = tmp_path / "pulls" + pulls_dir.mkdir() + (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z") + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + "updated_at": "2025-06-01T00:00:00Z", + "title": "Review retrieval fails", + } + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + raise Exception("temporary API failure") + raise AssertionError("Unexpected template: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + assert (pulls_dir / "reviews_last_update").read_text() == "2025-01-01T00:00:00Z" From b3a8241c9ab5930acfae2014d6a48a4feabe95ae Mon Sep 17 00:00:00 2001 From: Duncan Ogilvie Date: Sun, 26 Apr 2026 15:03:48 +0200 Subject: [PATCH 2/6] Implement per-resource last_update timestamps Closes #62 --- CHANGES.rst | 5 + README.rst | 12 +- github_backup/github_backup.py | 167 +++++++++++++++++--- tests/test_incremental_per_repository.py | 189 +++++++++++++++++++++++ 4 files changed, 348 insertions(+), 25 deletions(-) create mode 100644 tests/test_incremental_per_repository.py diff --git a/CHANGES.rst b/CHANGES.rst index b790ce1..6cf9f17 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -7,6 +7,11 @@ Unreleased optional attachment downloads, and per-repository incremental checkpoints. - Add pull request review backups with ``--pull-reviews`` and one-time incremental backfill for existing backups. +- Store incremental ``last_update`` checkpoints per repository resource instead + of using one global checkpoint for the whole output directory. Existing + backups use the legacy global checkpoint as a migration fallback, and the + legacy file is removed once existing issue/pull backups have resource + checkpoints (#62). - Add ``--token-from-gh`` to read authentication from ``gh auth token``. diff --git a/README.rst b/README.rst index 52d7222..3a4be3b 100644 --- a/README.rst +++ b/README.rst @@ -347,15 +347,19 @@ About pull request reviews Use ``--pull-reviews`` with ``--pulls`` to include GitHub pull request review metadata under each pull request's ``review_data`` key. Reviews are separate from review comments: ``--pull-comments`` backs up inline review comments via ``comment_data`` and regular PR conversation comments via ``comment_regular_data``, while ``--pull-reviews`` backs up review state, submitted time, commit ID, and the top-level review body. -``--pull-reviews`` is included in ``--all``. Incremental backups use a per-repository checkpoint at ``repositories/{repo}/pulls/reviews_last_update``. If ``--pull-reviews`` is enabled on an existing incremental backup, the first run performs a one-time backfill for pull request reviews so older PRs are not skipped by the existing repository checkpoint. Existing ``comment_data``, ``comment_regular_data`` and ``commit_data`` fields are preserved when only review data is being added. +``--pull-reviews`` is included in ``--all``. Incremental backups use a per-repository checkpoint at ``repositories/{repo}/pulls/reviews_last_update``. If ``--pull-reviews`` is enabled on an existing incremental backup, the first run performs a one-time backfill for pull request reviews so older PRs are not skipped by the existing pull request checkpoint. Existing ``comment_data``, ``comment_regular_data`` and ``commit_data`` fields are preserved when only review data is being added. Incremental Backup ------------------ -Using (``-i, --incremental``) will only request new data from the API **since the last run (successful or not)**. e.g. only request issues from the API since the last run. +Using (``-i, --incremental``) will only request new data from the API **since the last successful resource backup**. e.g. only request issues from the API since the last issue backup for that repository. -This means any blocking errors on previous runs can cause a large amount of missing data in backups. +Incremental checkpoints for issue and pull request API backups are stored per resource in that repository's backup directory (for example ``repositories/{repo}/issues/last_update``, ``repositories/{repo}/pulls/last_update`` or ``starred/{owner}/{repo}/pulls/last_update``). Older versions stored a single global ``last_update`` file in the output directory root. During migration, the legacy global checkpoint is used as a fallback only for resource directories that already contain backup data but do not yet have their own checkpoint. New repositories or newly enabled resources with no existing data get a full backup instead of inheriting an unrelated global checkpoint. + +After all existing issue and pull request resource directories have per-resource checkpoints, the legacy global ``last_update`` file is removed automatically. + +This means any blocking errors on previous runs can cause missing data in backups for the affected repository resource. Using (``--incremental-by-files``) will request new data from the API **based on when the file was modified on filesystem**. e.g. if you modify the file yourself you may miss something. @@ -368,7 +372,7 @@ Known blocking errors Some errors will block the backup run by exiting the script. e.g. receiving a 403 Forbidden error from the Github API. -If the incremental argument is used, this will result in the next backup only requesting API data since the last blocked/failed run. Potentially causing unexpected large amounts of missing data. +If the incremental argument is used, per-resource checkpoints are only advanced after that resource's backup work completes. A blocking error can still abort the overall run, but repositories and resources that were not processed will keep their previous checkpoints. It's therefore recommended to only use the incremental argument if the output/result is being actively monitored, or complimented with periodic full non-incremental runs, to avoid unexpected missing data in a regular backup runs. diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 054d0c6..e56bb28 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -1928,26 +1928,138 @@ def filter_repositories(args, unfiltered_repositories): return repositories +INCREMENTAL_LAST_UPDATE_FILENAME = "last_update" +INCREMENTAL_RESOURCE_DIRECTORIES = ("issues", "pulls") + + +def get_repository_checkpoint_time(repository): + timestamps = [ + timestamp + for timestamp in (repository.get("updated_at"), repository.get("pushed_at")) + if timestamp + ] + if timestamps: + return max(timestamps) + + return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime()) + + +def resource_backup_exists(resource_cwd): + if not os.path.isdir(resource_cwd): + return False + + ignored_names = { + INCREMENTAL_LAST_UPDATE_FILENAME, + PULL_REVIEWS_LAST_UPDATE_FILENAME, + } + for name in os.listdir(resource_cwd): + if name in ignored_names or name.endswith(".temp"): + continue + return True + + return False + + +def read_legacy_last_update(args, output_directory): + if not args.incremental: + return None, None + + last_update_path = os.path.join(output_directory, INCREMENTAL_LAST_UPDATE_FILENAME) + if os.path.exists(last_update_path): + return last_update_path, open(last_update_path).read().strip() + + return last_update_path, None + + +def read_resource_last_update(args, resource_cwd, legacy_last_update=None): + if not args.incremental: + return None + + last_update_path = os.path.join(resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME) + if os.path.exists(last_update_path): + return open(last_update_path).read().strip() + + if legacy_last_update and resource_backup_exists(resource_cwd): + return legacy_last_update + + return None + + +def write_resource_last_update(args, resource_cwd, repository): + if not args.incremental: + return + + mkdir_p(resource_cwd) + last_update_path = os.path.join(resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME) + open(last_update_path, "w").write(get_repository_checkpoint_time(repository)) + + +def iter_incremental_resource_dirs(output_directory): + repositories_dir = os.path.join(output_directory, "repositories") + if os.path.isdir(repositories_dir): + for repository_name in os.listdir(repositories_dir): + repo_cwd = os.path.join(repositories_dir, repository_name) + if not os.path.isdir(repo_cwd): + continue + for resource_name in INCREMENTAL_RESOURCE_DIRECTORIES: + yield os.path.join(repo_cwd, resource_name) + + starred_dir = os.path.join(output_directory, "starred") + if os.path.isdir(starred_dir): + for owner_name in os.listdir(starred_dir): + owner_cwd = os.path.join(starred_dir, owner_name) + if not os.path.isdir(owner_cwd): + continue + for repository_name in os.listdir(owner_cwd): + repo_cwd = os.path.join(owner_cwd, repository_name) + if not os.path.isdir(repo_cwd): + continue + for resource_name in INCREMENTAL_RESOURCE_DIRECTORIES: + yield os.path.join(repo_cwd, resource_name) + + +def has_unmigrated_incremental_resources(output_directory): + for resource_cwd in iter_incremental_resource_dirs(output_directory): + last_update_path = os.path.join( + resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME + ) + if resource_backup_exists(resource_cwd) and not os.path.exists( + last_update_path + ): + return True + + return False + + +def remove_legacy_last_update_if_migrated( + args, output_directory, legacy_last_update_path +): + if not args.incremental or not legacy_last_update_path: + return + if not os.path.exists(legacy_last_update_path): + return + if has_unmigrated_incremental_resources(output_directory): + logger.info( + "Keeping legacy global last_update until all existing issue/pull " + "backups have per-resource checkpoints" + ) + return + + os.remove(legacy_last_update_path) + logger.info( + "Removed legacy global last_update after migrating incremental checkpoints" + ) + + def backup_repositories(args, output_directory, repositories): logger.info("Backing up repositories") repos_template = "https://{0}/repos".format(get_github_api_host(args)) + legacy_last_update_path, legacy_last_update = read_legacy_last_update( + args, output_directory + ) + incremental_resource_work_attempted = False - if args.incremental: - last_update_path = os.path.join(output_directory, "last_update") - if os.path.exists(last_update_path): - args.since = open(last_update_path).read().strip() - else: - args.since = None - else: - args.since = None - - last_update = "0000-00-00T00:00:00Z" for repository in repositories: - if repository.get("updated_at") and repository["updated_at"] > last_update: - last_update = repository["updated_at"] - elif repository.get("pushed_at") and repository["pushed_at"] > last_update: - last_update = repository["pushed_at"] - if repository.get("is_gist"): repo_cwd = os.path.join(output_directory, "gists", repository["id"]) elif repository.get("is_starred"): @@ -2010,10 +2122,22 @@ def backup_repositories(args, output_directory, repositories): no_prune=args.no_prune, ) if args.include_issues or args.include_everything: + incremental_resource_work_attempted = True + issue_cwd = os.path.join(repo_cwd, "issues") + args.since = read_resource_last_update( + args, issue_cwd, legacy_last_update + ) backup_issues(args, repo_cwd, repository, repos_template) + write_resource_last_update(args, issue_cwd, repository) if args.include_pulls or args.include_everything: + incremental_resource_work_attempted = True + pulls_cwd = os.path.join(repo_cwd, "pulls") + args.since = read_resource_last_update( + args, pulls_cwd, legacy_last_update + ) backup_pulls(args, repo_cwd, repository, repos_template) + write_resource_last_update(args, pulls_cwd, repository) if args.include_discussions or args.include_everything: backup_discussions(args, repo_cwd, repository) @@ -2021,7 +2145,9 @@ def backup_repositories(args, output_directory, repositories): if args.include_milestones or args.include_everything: backup_milestones(args, repo_cwd, repository, repos_template) - if args.include_security_advisories or (args.include_everything and not repository.get("private", False)): + if args.include_security_advisories or ( + args.include_everything and not repository.get("private", False) + ): backup_security_advisories(args, repo_cwd, repository, repos_template) if args.include_labels or args.include_everything: @@ -2045,11 +2171,10 @@ def backup_repositories(args, output_directory, repositories): logger.info(f"Skipping remaining resources for {repository['full_name']}") continue - if args.incremental: - if last_update == "0000-00-00T00:00:00Z": - last_update = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime()) - - open(last_update_path, "w").write(last_update) + if incremental_resource_work_attempted: + remove_legacy_last_update_if_migrated( + args, output_directory, legacy_last_update_path + ) def _repository_owner_name(repository): diff --git a/tests/test_incremental_per_repository.py b/tests/test_incremental_per_repository.py new file mode 100644 index 0000000..f1fd67a --- /dev/null +++ b/tests/test_incremental_per_repository.py @@ -0,0 +1,189 @@ +"""Tests for per-resource incremental checkpoints.""" + +import json +import os + +from github_backup import github_backup + + +def _repo(name, updated_at, pushed_at=None): + return { + "name": name, + "full_name": "owner/{0}".format(name), + "owner": {"login": "owner"}, + "clone_url": "https://github.com/owner/{0}.git".format(name), + "private": False, + "fork": False, + "has_wiki": False, + "updated_at": updated_at, + "pushed_at": pushed_at, + } + + +def test_incremental_uses_per_resource_last_update( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repositories = [ + _repo("repo-one", "2026-02-01T00:00:00Z"), + _repo("repo-two", "2026-03-01T00:00:00Z"), + ] + repo_one_issues = tmp_path / "repositories" / "repo-one" / "issues" + repo_two_issues = tmp_path / "repositories" / "repo-two" / "issues" + repo_one_issues.mkdir(parents=True) + repo_two_issues.mkdir(parents=True) + (repo_one_issues / "last_update").write_text("2026-01-01T00:00:00Z") + (repo_two_issues / "last_update").write_text("2025-01-01T00:00:00Z") + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append((repository["name"], passed_args.since)) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, repositories) + + assert seen_since == [ + ("repo-one", "2026-01-01T00:00:00Z"), + ("repo-two", "2025-01-01T00:00:00Z"), + ] + assert (repo_one_issues / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert (repo_two_issues / "last_update").read_text() == "2026-03-01T00:00:00Z" + assert not os.path.exists(tmp_path / "last_update") + + +def test_incremental_uses_independent_issue_and_pull_checkpoints( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True, include_pulls=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + repo_dir = tmp_path / "repositories" / "repo-one" + issues_dir = repo_dir / "issues" + pulls_dir = repo_dir / "pulls" + issues_dir.mkdir(parents=True) + pulls_dir.mkdir(parents=True) + (issues_dir / "last_update").write_text("2026-01-01T00:00:00Z") + (pulls_dir / "last_update").write_text("2025-01-01T00:00:00Z") + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append(("issues", passed_args.since)) + + def fake_backup_pulls(passed_args, repo_cwd, repository, repos_template): + seen_since.append(("pulls", passed_args.since)) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + monkeypatch.setattr(github_backup, "backup_pulls", fake_backup_pulls) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert seen_since == [ + ("issues", "2026-01-01T00:00:00Z"), + ("pulls", "2025-01-01T00:00:00Z"), + ] + assert (issues_dir / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert (pulls_dir / "last_update").read_text() == "2026-02-01T00:00:00Z" + + +def test_incremental_uses_legacy_global_last_update_for_existing_resource_backup( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z") + issues_dir = tmp_path / "repositories" / "repo-one" / "issues" + issues_dir.mkdir(parents=True) + with open(issues_dir / "1.json", "w", encoding="utf-8") as f: + json.dump({"number": 1}, f) + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append(passed_args.since) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert seen_since == ["2026-01-01T00:00:00Z"] + assert (issues_dir / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert not os.path.exists(tmp_path / "last_update") + + +def test_incremental_does_not_use_legacy_global_last_update_for_new_resource_backup( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2099-01-01T00:00:00Z") + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append(passed_args.since) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert seen_since == [None] + assert ( + tmp_path / "repositories" / "repo-one" / "issues" / "last_update" + ).read_text() == "2026-02-01T00:00:00Z" + assert not os.path.exists(tmp_path / "last_update") + + +def test_incremental_keeps_legacy_global_last_update_until_all_existing_resources_migrated( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z") + repo_one_issues = tmp_path / "repositories" / "repo-one" / "issues" + repo_two_issues = tmp_path / "repositories" / "repo-two" / "issues" + repo_one_issues.mkdir(parents=True) + repo_two_issues.mkdir(parents=True) + with open(repo_one_issues / "1.json", "w", encoding="utf-8") as f: + json.dump({"number": 1}, f) + with open(repo_two_issues / "2.json", "w", encoding="utf-8") as f: + json.dump({"number": 2}, f) + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + pass + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert (repo_one_issues / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert not os.path.exists(repo_two_issues / "last_update") + assert (tmp_path / "last_update").read_text() == "2026-01-01T00:00:00Z" + + +def test_incremental_does_not_remove_legacy_checkpoint_without_resource_work( + create_args, tmp_path +): + args = create_args(incremental=True, include_repository=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z") + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert (tmp_path / "last_update").read_text() == "2026-01-01T00:00:00Z" + assert not os.path.exists( + tmp_path / "repositories" / "repo-one" / "issues" / "last_update" + ) + + +def test_repository_checkpoint_time_uses_newest_available_repo_timestamp(): + repository = _repo( + "repo-one", + updated_at="2026-02-01T00:00:00Z", + pushed_at="2026-03-01T00:00:00Z", + ) + + assert github_backup.get_repository_checkpoint_time(repository) == ( + "2026-03-01T00:00:00Z" + ) From 6cd0ab3633df812ab586968b5b2e448e0e1b3efc Mon Sep 17 00:00:00 2001 From: Duncan Ogilvie Date: Sun, 26 Apr 2026 15:15:22 +0200 Subject: [PATCH 3/6] Reduce unnecessary pull requests with incremental fetching --- CHANGES.rst | 2 + github_backup/github_backup.py | 18 +++-- tests/test_pull_incremental_pagination.py | 85 +++++++++++++++++++++++ tests/test_pull_reviews.py | 10 +-- 4 files changed, 104 insertions(+), 11 deletions(-) create mode 100644 tests/test_pull_incremental_pagination.py diff --git a/CHANGES.rst b/CHANGES.rst index 6cf9f17..8b62d33 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -12,6 +12,8 @@ Unreleased backups use the legacy global checkpoint as a migration fallback, and the legacy file is removed once existing issue/pull backups have resource checkpoints (#62). +- Stop paginating pull requests during incremental backups once the sorted + results are older than the active checkpoint. - Add ``--token-from-gh`` to read authentication from ``gh auth token``. diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index e56bb28..f83bdb3 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -717,11 +717,12 @@ def calculate_retry_delay(attempt, headers): return delay + random.uniform(0, delay * 0.1) -def retrieve_data(args, template, query_args=None, paginated=True): +def retrieve_data(args, template, query_args=None, paginated=True, lazy=False): """ Fetch the data from GitHub API. - Handle both single requests and pagination with yield of individual dicts. + Handle both single requests and pagination. Returns a list by default, or + a generator when lazy=True so callers can stop before fetching every page. Handles throttling, retries, read errors, and DMCA takedowns. """ query_args = query_args or {} @@ -851,6 +852,9 @@ def _extract_legal_url(response_body_bytes): ): break # No more data + if lazy: + return fetch_all() + return list(fetch_all()) @@ -2656,16 +2660,18 @@ def pull_is_due_for_repository_checkpoint(pull): pull_states = ["open", "closed"] for pull_state in pull_states: query_args["state"] = pull_state - _pulls = retrieve_data(args, _pulls_template, query_args=query_args) - for pull in _pulls: + for pull in retrieve_data( + args, _pulls_template, query_args=query_args, lazy=True + ): track_newest_pull_update(pull) if pulls_since and pull["updated_at"] < pulls_since: break if not pulls_since or pull["updated_at"] >= pulls_since: pulls[pull["number"]] = pull else: - _pulls = retrieve_data(args, _pulls_template, query_args=query_args) - for pull in _pulls: + for pull in retrieve_data( + args, _pulls_template, query_args=query_args, lazy=True + ): track_newest_pull_update(pull) if pulls_since and pull["updated_at"] < pulls_since: break diff --git a/tests/test_pull_incremental_pagination.py b/tests/test_pull_incremental_pagination.py new file mode 100644 index 0000000..11230b0 --- /dev/null +++ b/tests/test_pull_incremental_pagination.py @@ -0,0 +1,85 @@ +"""Tests for incremental pull request pagination.""" + +import json +import os +from unittest.mock import patch + +from github_backup import github_backup + + +class MockHTTPResponse: + def __init__(self, data, link_header=None): + self._content = json.dumps(data).encode("utf-8") + self._link_header = link_header + self._read = False + self.reason = "OK" + + def getcode(self): + return 200 + + def read(self): + if self._read: + return b"" + self._read = True + return self._content + + @property + def headers(self): + headers = {"x-ratelimit-remaining": "5000"} + if self._link_header: + headers["Link"] = self._link_header + return headers + + +def test_backup_pulls_incremental_stops_before_fetching_old_pages( + create_args, tmp_path +): + args = create_args(include_pulls=True, incremental=True) + args.since = "2026-04-26T08:13:46Z" + repository = {"full_name": "owner/repo"} + + responses = [ + MockHTTPResponse([]), + MockHTTPResponse( + [ + { + "number": 2, + "title": "new pull", + "updated_at": "2026-04-26T09:00:00Z", + }, + { + "number": 1, + "title": "old pull", + "updated_at": "2026-04-26T07:00:00Z", + }, + ], + link_header='; rel="next"', + ), + MockHTTPResponse( + [ + { + "number": 0, + "title": "older pull on page 2", + "updated_at": "2026-04-25T07:00:00Z", + } + ] + ), + ] + requests_made = [] + + def mock_urlopen(request, *args, **kwargs): + requests_made.append(request.get_full_url()) + return responses[len(requests_made) - 1] + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + assert len(requests_made) == 2 + assert "state=open" in requests_made[0] + assert "state=closed" in requests_made[1] + assert all("page=2" not in url for url in requests_made) + assert os.path.exists(tmp_path / "pulls" / "2.json") + assert not os.path.exists(tmp_path / "pulls" / "1.json") + assert not os.path.exists(tmp_path / "pulls" / "0.json") diff --git a/tests/test_pull_reviews.py b/tests/test_pull_reviews.py index 6130269..2ce9ad1 100644 --- a/tests/test_pull_reviews.py +++ b/tests/test_pull_reviews.py @@ -16,7 +16,7 @@ def test_backup_pulls_includes_review_data(create_args, tmp_path, monkeypatch): repository = {"full_name": "owner/repo"} calls = [] - def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): calls.append((template, query_args)) if template == "https://api.github.com/repos/owner/repo/pulls": if query_args["state"] == "open": @@ -73,7 +73,7 @@ def test_pull_reviews_backfill_ignores_repository_checkpoint( args.since = "2026-01-01T00:00:00Z" repository = {"full_name": "owner/repo"} - def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): if template == "https://api.github.com/repos/owner/repo/pulls": if query_args["state"] == "open": return [ @@ -117,7 +117,7 @@ def test_pull_reviews_uses_review_checkpoint_when_older_than_repository_checkpoi pulls_dir.mkdir() (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z") - def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): if template == "https://api.github.com/repos/owner/repo/pulls": if query_args["state"] == "open": return [ @@ -169,7 +169,7 @@ def test_pull_reviews_preserves_existing_optional_pull_data( f, ) - def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): if template == "https://api.github.com/repos/owner/repo/pulls": if query_args["state"] == "open": return [ @@ -213,7 +213,7 @@ def test_pull_reviews_does_not_advance_checkpoint_on_review_error( pulls_dir.mkdir() (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z") - def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): if template == "https://api.github.com/repos/owner/repo/pulls": if query_args["state"] == "open": return [ From 9d0cfdb61da1cea97b381c2177ccc4e52e9a6352 Mon Sep 17 00:00:00 2001 From: Duncan Ogilvie Date: Sun, 26 Apr 2026 16:05:20 +0200 Subject: [PATCH 4/6] Avoid redundant release asset list requests --- CHANGES.rst | 2 + github_backup/github_backup.py | 7 ++- tests/test_releases.py | 95 ++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 tests/test_releases.py diff --git a/CHANGES.rst b/CHANGES.rst index 8b62d33..3d2ceb0 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -14,6 +14,8 @@ Unreleased checkpoints (#62). - Stop paginating pull requests during incremental backups once the sorted results are older than the active checkpoint. +- Avoid extra release asset list requests by using asset metadata already + included in GitHub's releases response. - Add ``--token-from-gh`` to read authentication from ``gh auth token``. diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index f83bdb3..6edfb05 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -2919,7 +2919,12 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F written_count += 1 if include_assets and not skip_assets: - assets = retrieve_data(args, release["assets_url"]) + # The releases list API already includes release asset metadata. Use + # it to avoid an extra /releases/{id}/assets request per release. + # Keep a fallback for older/enterprise responses that might omit it. + assets = release.get("assets") + if assets is None: + assets = retrieve_data(args, release["assets_url"]) if len(assets) > 0: # give release asset files somewhere to live & download them (not including source archives) release_assets_cwd = os.path.join(release_cwd, release_name_safe) diff --git a/tests/test_releases.py b/tests/test_releases.py new file mode 100644 index 0000000..b8584f4 --- /dev/null +++ b/tests/test_releases.py @@ -0,0 +1,95 @@ +"""Tests for release backup behavior.""" + +from github_backup import github_backup + + +def test_backup_releases_uses_embedded_assets_without_extra_asset_list_request( + create_args, tmp_path, monkeypatch +): + args = create_args(include_releases=True, include_assets=True) + repository = {"full_name": "owner/repo", "name": "repo"} + calls = [] + downloads = [] + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): + calls.append(template) + if template == "https://api.github.com/repos/owner/repo/releases": + return [ + { + "tag_name": "v1.0.0", + "created_at": "2026-01-01T00:00:00Z", + "updated_at": "2026-01-01T00:00:00Z", + "prerelease": False, + "draft": False, + "assets_url": "https://api.github.com/repos/owner/repo/releases/1/assets", + "assets": [ + { + "name": "artifact.zip", + "url": "https://api.github.com/repos/owner/repo/releases/assets/1", + } + ], + } + ] + raise AssertionError("Unexpected API request: {0}".format(template)) + + def fake_download_file(url, path, auth, as_app=False, fine=False): + downloads.append((url, path)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + monkeypatch.setattr(github_backup, "download_file", fake_download_file) + + github_backup.backup_releases( + args, + tmp_path, + repository, + "https://api.github.com/repos", + include_assets=True, + ) + + assert calls == ["https://api.github.com/repos/owner/repo/releases"] + assert downloads == [ + ( + "https://api.github.com/repos/owner/repo/releases/assets/1", + str(tmp_path / "releases" / "v1.0.0" / "artifact.zip"), + ) + ] + + +def test_backup_releases_falls_back_to_assets_url_when_assets_missing( + create_args, tmp_path, monkeypatch +): + args = create_args(include_releases=True, include_assets=True) + repository = {"full_name": "owner/repo", "name": "repo"} + calls = [] + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): + calls.append(template) + if template == "https://api.github.com/repos/owner/repo/releases": + return [ + { + "tag_name": "v1.0.0", + "created_at": "2026-01-01T00:00:00Z", + "updated_at": "2026-01-01T00:00:00Z", + "prerelease": False, + "draft": False, + "assets_url": "https://api.github.com/repos/owner/repo/releases/1/assets", + } + ] + if template == "https://api.github.com/repos/owner/repo/releases/1/assets": + return [] + raise AssertionError("Unexpected API request: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_releases( + args, + tmp_path, + repository, + "https://api.github.com/repos", + include_assets=True, + ) + + assert calls == [ + "https://api.github.com/repos/owner/repo/releases", + "https://api.github.com/repos/owner/repo/releases/1/assets", + ] From 014eff395a999f82674547efd77a6470b038ce91 Mon Sep 17 00:00:00 2001 From: Duncan Ogilvie Date: Sun, 26 Apr 2026 16:09:42 +0200 Subject: [PATCH 5/6] Skip checkpoint-equal incremental items --- CHANGES.rst | 4 +- github_backup/github_backup.py | 12 +++--- tests/test_discussions.py | 35 +++++++++++++++++ tests/test_pull_incremental_pagination.py | 46 +++++++++++++++++++++++ 4 files changed, 90 insertions(+), 7 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 3d2ceb0..3d4cdce 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -13,7 +13,9 @@ Unreleased legacy file is removed once existing issue/pull backups have resource checkpoints (#62). - Stop paginating pull requests during incremental backups once the sorted - results are older than the active checkpoint. + results are at or older than the active checkpoint. +- Avoid re-fetching discussions and pull requests whose ``updated_at`` exactly + matches the active incremental checkpoint. - Avoid extra release asset list requests by using asset metadata already included in GitHub's releases response. - Add ``--token-from-gh`` to read authentication from ``gh auth token``. diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 6edfb05..ae4ef2e 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -2233,7 +2233,7 @@ def retrieve_discussion_summaries(args, repository, since=None): if updated_at and (newest_seen is None or updated_at > newest_seen): newest_seen = updated_at - if since and updated_at and updated_at < since: + if since and updated_at and updated_at <= since: stop = True break @@ -2654,7 +2654,7 @@ def track_newest_pull_update(pull): newest_pull_update = updated_at def pull_is_due_for_repository_checkpoint(pull): - return not repository_since or pull["updated_at"] >= repository_since + return not repository_since or pull["updated_at"] > repository_since if not args.include_pull_details: pull_states = ["open", "closed"] @@ -2664,18 +2664,18 @@ def pull_is_due_for_repository_checkpoint(pull): args, _pulls_template, query_args=query_args, lazy=True ): track_newest_pull_update(pull) - if pulls_since and pull["updated_at"] < pulls_since: + if pulls_since and pull["updated_at"] <= pulls_since: break - if not pulls_since or pull["updated_at"] >= pulls_since: + if not pulls_since or pull["updated_at"] > pulls_since: pulls[pull["number"]] = pull else: for pull in retrieve_data( args, _pulls_template, query_args=query_args, lazy=True ): track_newest_pull_update(pull) - if pulls_since and pull["updated_at"] < pulls_since: + if pulls_since and pull["updated_at"] <= pulls_since: break - if not pulls_since or pull["updated_at"] >= pulls_since: + if not pulls_since or pull["updated_at"] > pulls_since: if pull_is_due_for_repository_checkpoint(pull): pulls[pull["number"]] = retrieve_data( args, diff --git a/tests/test_discussions.py b/tests/test_discussions.py index 89fd8dd..2b5e3fb 100644 --- a/tests/test_discussions.py +++ b/tests/test_discussions.py @@ -50,6 +50,41 @@ def test_retrieve_discussion_summaries_stops_at_incremental_since(create_args): ) +def test_retrieve_discussion_summaries_excludes_checkpoint_timestamp(create_args): + args = create_args() + repository = {"full_name": "owner/repo"} + + page = { + "repository": { + "hasDiscussionsEnabled": True, + "discussions": { + "totalCount": 1, + "nodes": [ + { + "number": 1, + "title": "already backed up", + "updatedAt": "2026-01-01T00:00:00Z", + }, + ], + "pageInfo": {"hasNextPage": True, "endCursor": "NEXT"}, + }, + } + } + + with patch( + "github_backup.github_backup.retrieve_graphql_data", return_value=page + ) as mock_retrieve: + summaries, newest, enabled, total = github_backup.retrieve_discussion_summaries( + args, repository, since="2026-01-01T00:00:00Z" + ) + + assert enabled is True + assert total == 1 + assert newest == "2026-01-01T00:00:00Z" + assert summaries == [] + assert mock_retrieve.call_count == 1 + + def test_retrieve_discussion_summaries_disabled_discussions(create_args): args = create_args() repository = {"full_name": "owner/repo"} diff --git a/tests/test_pull_incremental_pagination.py b/tests/test_pull_incremental_pagination.py index 11230b0..ac0f83f 100644 --- a/tests/test_pull_incremental_pagination.py +++ b/tests/test_pull_incremental_pagination.py @@ -31,6 +31,52 @@ def headers(self): return headers +def test_backup_pulls_incremental_excludes_checkpoint_timestamp(create_args, tmp_path): + args = create_args(include_pulls=True, incremental=True) + args.since = "2026-04-26T08:13:46Z" + repository = {"full_name": "owner/repo"} + + responses = [ + MockHTTPResponse([]), + MockHTTPResponse( + [ + { + "number": 1, + "title": "already backed up", + "updated_at": "2026-04-26T08:13:46Z", + }, + ], + link_header='; rel="next"', + ), + MockHTTPResponse( + [ + { + "number": 0, + "title": "older pull on page 2", + "updated_at": "2026-04-25T07:00:00Z", + } + ] + ), + ] + requests_made = [] + + def mock_urlopen(request, *args, **kwargs): + requests_made.append(request.get_full_url()) + return responses[len(requests_made) - 1] + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + assert len(requests_made) == 2 + assert "state=open" in requests_made[0] + assert "state=closed" in requests_made[1] + assert all("page=2" not in url for url in requests_made) + assert not os.path.exists(tmp_path / "pulls" / "1.json") + assert not os.path.exists(tmp_path / "pulls" / "0.json") + + def test_backup_pulls_incremental_stops_before_fetching_old_pages( create_args, tmp_path ): From f8cdf55050770bbcb1b5ba178d73b346988f0f89 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Wed, 29 Apr 2026 12:10:11 +0000 Subject: [PATCH 6/6] Release version 0.62.0 --- CHANGES.rst | 172 +++++++++++++++++++++++++++++++++----- github_backup/__init__.py | 2 +- 2 files changed, 154 insertions(+), 20 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 3d4cdce..86bcb32 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,29 +1,163 @@ Changelog ========= -Unreleased ----------- -- Add GitHub Discussions backups via GraphQL, including comments, replies, - optional attachment downloads, and per-repository incremental checkpoints. -- Add pull request review backups with ``--pull-reviews`` and one-time - incremental backfill for existing backups. -- Store incremental ``last_update`` checkpoints per repository resource instead - of using one global checkpoint for the whole output directory. Existing - backups use the legacy global checkpoint as a migration fallback, and the - legacy file is removed once existing issue/pull backups have resource - checkpoints (#62). -- Stop paginating pull requests during incremental backups once the sorted - results are at or older than the active checkpoint. -- Avoid re-fetching discussions and pull requests whose ``updated_at`` exactly - matches the active incremental checkpoint. -- Avoid extra release asset list requests by using asset metadata already - included in GitHub's releases response. -- Add ``--token-from-gh`` to read authentication from ``gh auth token``. +0.62.0 (2026-04-29) +------------------- +------------------------ +- Skip checkpoint-equal incremental items. [Duncan Ogilvie] +- Avoid redundant release asset list requests. [Duncan Ogilvie] +- Reduce unnecessary pull requests with incremental fetching. [Duncan + Ogilvie] +- Implement per-resource last_update timestamps. [Duncan Ogilvie] + + Closes #62 +- Add support for pull request reviews. [Duncan Ogilvie] + + Closes #124 +- Add support for discussions. [Duncan Ogilvie] + + Closes #290 +- Add --token-from-gh authentication option. [Duncan Ogilvie] +- Chore(deps): bump pytest in the python-packages group. + [dependabot[bot]] + + Bumps the python-packages group with 1 update: [pytest](https://github.com/pytest-dev/pytest). + + + Updates `pytest` from 9.0.2 to 9.0.3 + - [Release notes](https://github.com/pytest-dev/pytest/releases) + - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) + - [Commits](https://github.com/pytest-dev/pytest/compare/9.0.2...9.0.3) + + --- + updated-dependencies: + - dependency-name: pytest + dependency-version: 9.0.3 + dependency-type: direct:production + update-type: version-update:semver-patch + dependency-group: python-packages + ... +- Chore(deps): bump black in the python-packages group. + [dependabot[bot]] + + Bumps the python-packages group with 1 update: [black](https://github.com/psf/black). + + + Updates `black` from 26.3.0 to 26.3.1 + - [Release notes](https://github.com/psf/black/releases) + - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) + - [Commits](https://github.com/psf/black/compare/26.3.0...26.3.1) + + --- + updated-dependencies: + - dependency-name: black + dependency-version: 26.3.1 + dependency-type: direct:production + update-type: version-update:semver-patch + dependency-group: python-packages + ... +- Chore(deps): bump docker/login-action from 3 to 4. [dependabot[bot]] + + Bumps [docker/login-action](https://github.com/docker/login-action) from 3 to 4. + - [Release notes](https://github.com/docker/login-action/releases) + - [Commits](https://github.com/docker/login-action/compare/v3...v4) + + --- + updated-dependencies: + - dependency-name: docker/login-action + dependency-version: '4' + dependency-type: direct:production + update-type: version-update:semver-major + ... +- Chore(deps): bump docker/setup-qemu-action from 3 to 4. + [dependabot[bot]] + + Bumps [docker/setup-qemu-action](https://github.com/docker/setup-qemu-action) from 3 to 4. + - [Release notes](https://github.com/docker/setup-qemu-action/releases) + - [Commits](https://github.com/docker/setup-qemu-action/compare/v3...v4) + + --- + updated-dependencies: + - dependency-name: docker/setup-qemu-action + dependency-version: '4' + dependency-type: direct:production + update-type: version-update:semver-major + ... +- Chore(deps): bump docker/build-push-action from 6 to 7. + [dependabot[bot]] + + Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 6 to 7. + - [Release notes](https://github.com/docker/build-push-action/releases) + - [Commits](https://github.com/docker/build-push-action/compare/v6...v7) + + --- + updated-dependencies: + - dependency-name: docker/build-push-action + dependency-version: '7' + dependency-type: direct:production + update-type: version-update:semver-major + ... +- Chore(deps): bump docker/setup-buildx-action from 3 to 4. + [dependabot[bot]] + + Bumps [docker/setup-buildx-action](https://github.com/docker/setup-buildx-action) from 3 to 4. + - [Release notes](https://github.com/docker/setup-buildx-action/releases) + - [Commits](https://github.com/docker/setup-buildx-action/compare/v3...v4) + + --- + updated-dependencies: + - dependency-name: docker/setup-buildx-action + dependency-version: '4' + dependency-type: direct:production + update-type: version-update:semver-major + ... +- Chore(deps): bump docker/metadata-action from 5 to 6. + [dependabot[bot]] + + Bumps [docker/metadata-action](https://github.com/docker/metadata-action) from 5 to 6. + - [Release notes](https://github.com/docker/metadata-action/releases) + - [Commits](https://github.com/docker/metadata-action/compare/v5...v6) + + --- + updated-dependencies: + - dependency-name: docker/metadata-action + dependency-version: '6' + dependency-type: direct:production + update-type: version-update:semver-major + ... +- Chore(deps): bump the python-packages group with 2 updates. + [dependabot[bot]] + + Bumps the python-packages group with 2 updates: [black](https://github.com/psf/black) and [setuptools](https://github.com/pypa/setuptools). + + + Updates `black` from 26.1.0 to 26.3.0 + - [Release notes](https://github.com/psf/black/releases) + - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) + - [Commits](https://github.com/psf/black/compare/26.1.0...26.3.0) + + Updates `setuptools` from 82.0.0 to 82.0.1 + - [Release notes](https://github.com/pypa/setuptools/releases) + - [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst) + - [Commits](https://github.com/pypa/setuptools/compare/v82.0.0...v82.0.1) + + --- + updated-dependencies: + - dependency-name: black + dependency-version: 26.3.0 + dependency-type: direct:production + update-type: version-update:semver-minor + dependency-group: python-packages + - dependency-name: setuptools + dependency-version: 82.0.1 + dependency-type: direct:production + update-type: version-update:semver-patch + dependency-group: python-packages + ... 0.61.5 (2026-02-18) ------------------- ------------------------- - Fix empty repository crash due to None timestamp comparison (#489) [Rodos] diff --git a/github_backup/__init__.py b/github_backup/__init__.py index 294be4d..647040d 100644 --- a/github_backup/__init__.py +++ b/github_backup/__init__.py @@ -1 +1 @@ -__version__ = "0.61.5" +__version__ = "0.62.0"