From f54a5458f6db668a5ff4d6395d792e00d20999e7 Mon Sep 17 00:00:00 2001 From: Rodos Date: Wed, 18 Feb 2026 20:10:48 +1100 Subject: [PATCH 01/35] Fix empty repository crash due to None timestamp comparison (#489) Empty repositories have None for pushed_at/updated_at, causing a TypeError when compared to the last_update string. Use .get() with truthiness check to skip None timestamps in incremental tracking. --- github_backup/github_backup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index ada2d40..4d5394e 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -1772,9 +1772,9 @@ def backup_repositories(args, output_directory, repositories): last_update = "0000-00-00T00:00:00Z" for repository in repositories: - if "updated_at" in repository and repository["updated_at"] > last_update: + if repository.get("updated_at") and repository["updated_at"] > last_update: last_update = repository["updated_at"] - elif "pushed_at" in repository and repository["pushed_at"] > last_update: + elif repository.get("pushed_at") and repository["pushed_at"] > last_update: last_update = repository["pushed_at"] if repository.get("is_gist"): From 68af1d406a5ee0249829b24972e0d9bc77320a5a Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Wed, 18 Feb 2026 21:04:32 +0000 Subject: [PATCH 02/35] Release version 0.61.5 --- CHANGES.rst | 12 +++++++++++- github_backup/__init__.py | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 808da6b..6041b9e 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,9 +1,19 @@ Changelog ========= -0.61.4 (2026-02-16) +0.61.5 (2026-02-18) ------------------- ------------------------ +- Fix empty repository crash due to None timestamp comparison (#489) + [Rodos] + + Empty repositories have None for pushed_at/updated_at, causing a + TypeError when compared to the last_update string. 
Use .get() with + truthiness check to skip None timestamps in incremental tracking. + + +0.61.4 (2026-02-16) +------------------- - Fix HTTP 451 DMCA and 403 TOS handling regression (#487) [Rodos] The DMCA handling added in PR #454 had a bug: make_request_with_retry() diff --git a/github_backup/__init__.py b/github_backup/__init__.py index 03f7dee..294be4d 100644 --- a/github_backup/__init__.py +++ b/github_backup/__init__.py @@ -1 +1 @@ -__version__ = "0.61.4" +__version__ = "0.61.5" From 8a0553a5b175a9f91449e6a29b37ceffeff26c1e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 04:33:49 +0000 Subject: [PATCH 03/35] chore(deps): bump docker/metadata-action from 5 to 6 Bumps [docker/metadata-action](https://github.com/docker/metadata-action) from 5 to 6. - [Release notes](https://github.com/docker/metadata-action/releases) - [Commits](https://github.com/docker/metadata-action/compare/v5...v6) --- updated-dependencies: - dependency-name: docker/metadata-action dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index f367b99..1aa81fe 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -57,7 +57,7 @@ jobs: - name: Extract metadata (tags, labels) for Docker id: meta - uses: docker/metadata-action@v5 + uses: docker/metadata-action@v6 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} tags: | From 7f1807aaf82ac3565e1e4f1261644b376d0a5600 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 04:33:53 +0000 Subject: [PATCH 04/35] chore(deps): bump docker/setup-buildx-action from 3 to 4 Bumps [docker/setup-buildx-action](https://github.com/docker/setup-buildx-action) from 3 to 4. - [Release notes](https://github.com/docker/setup-buildx-action/releases) - [Commits](https://github.com/docker/setup-buildx-action/compare/v3...v4) --- updated-dependencies: - dependency-name: docker/setup-buildx-action dependency-version: '4' dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index f367b99..b9103c5 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -46,7 +46,7 @@ jobs: uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@v4 - name: Log in to the Container registry uses: docker/login-action@v3 From cceef92346fb8c6fb672b29b8f0917e95cbcb591 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 04:33:55 +0000 Subject: [PATCH 05/35] chore(deps): bump docker/setup-qemu-action from 3 to 4 Bumps [docker/setup-qemu-action](https://github.com/docker/setup-qemu-action) from 3 to 4. - [Release notes](https://github.com/docker/setup-qemu-action/releases) - [Commits](https://github.com/docker/setup-qemu-action/compare/v3...v4) --- updated-dependencies: - dependency-name: docker/setup-qemu-action dependency-version: '4' dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index f367b99..749ed52 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -43,7 +43,7 @@ jobs: persist-credentials: false - name: Set up QEMU - uses: docker/setup-qemu-action@v3 + uses: docker/setup-qemu-action@v4 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 From 5758e489e82305bfcdc02cf643c6c543b489ebb7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 04:33:58 +0000 Subject: [PATCH 06/35] chore(deps): bump docker/build-push-action from 6 to 7 Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 6 to 7. - [Release notes](https://github.com/docker/build-push-action/releases) - [Commits](https://github.com/docker/build-push-action/compare/v6...v7) --- updated-dependencies: - dependency-name: docker/build-push-action dependency-version: '7' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index f367b99..00fdec3 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -68,7 +68,7 @@ jobs: type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'main') }} - name: Build and push Docker image - uses: docker/build-push-action@v6 + uses: docker/build-push-action@v7 with: context: . 
push: true From d5be07ec809c9c0ca7bfafc80345f09c9baf532b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 13:28:37 +0000 Subject: [PATCH 07/35] chore(deps): bump the python-packages group with 2 updates Bumps the python-packages group with 2 updates: [black](https://github.com/psf/black) and [setuptools](https://github.com/pypa/setuptools). Updates `black` from 26.1.0 to 26.3.0 - [Release notes](https://github.com/psf/black/releases) - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) - [Commits](https://github.com/psf/black/compare/26.1.0...26.3.0) Updates `setuptools` from 82.0.0 to 82.0.1 - [Release notes](https://github.com/pypa/setuptools/releases) - [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst) - [Commits](https://github.com/pypa/setuptools/compare/v82.0.0...v82.0.1) --- updated-dependencies: - dependency-name: black dependency-version: 26.3.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: python-packages - dependency-name: setuptools dependency-version: 82.0.1 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: python-packages ... 
Signed-off-by: dependabot[bot] --- release-requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/release-requirements.txt b/release-requirements.txt index 6742290..65a036b 100644 --- a/release-requirements.txt +++ b/release-requirements.txt @@ -1,6 +1,6 @@ # Linting & Formatting autopep8==2.3.2 -black==26.1.0 +black==26.3.0 flake8==7.3.0 # Testing @@ -9,7 +9,7 @@ pytest==9.0.2 # Release & Publishing twine==6.2.0 gitchangelog==3.0.4 -setuptools==82.0.0 +setuptools==82.0.1 # Documentation restructuredtext-lint==2.0.2 From 3d961d11184f1fc384a8be290347b1de1e5064fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 17:26:41 +0000 Subject: [PATCH 08/35] chore(deps): bump docker/login-action from 3 to 4 Bumps [docker/login-action](https://github.com/docker/login-action) from 3 to 4. - [Release notes](https://github.com/docker/login-action/releases) - [Commits](https://github.com/docker/login-action/compare/v3...v4) --- updated-dependencies: - dependency-name: docker/login-action dependency-version: '4' dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 9508f94..4e5c89b 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -49,7 +49,7 @@ jobs: uses: docker/setup-buildx-action@v4 - name: Log in to the Container registry - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} From f85c759e5df58bb5c1c680943bedbf03b9141afb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 12 Mar 2026 13:05:24 +0000 Subject: [PATCH 09/35] chore(deps): bump black in the python-packages group Bumps the python-packages group with 1 update: [black](https://github.com/psf/black). Updates `black` from 26.3.0 to 26.3.1 - [Release notes](https://github.com/psf/black/releases) - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) - [Commits](https://github.com/psf/black/compare/26.3.0...26.3.1) --- updated-dependencies: - dependency-name: black dependency-version: 26.3.1 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: python-packages ... 
Signed-off-by: dependabot[bot] --- release-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release-requirements.txt b/release-requirements.txt index 65a036b..ddc1430 100644 --- a/release-requirements.txt +++ b/release-requirements.txt @@ -1,6 +1,6 @@ # Linting & Formatting autopep8==2.3.2 -black==26.3.0 +black==26.3.1 flake8==7.3.0 # Testing From 9fde6ed1ffff0660b8ead272c4993bd472312762 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 Apr 2026 13:05:48 +0000 Subject: [PATCH 10/35] chore(deps): bump pytest in the python-packages group Bumps the python-packages group with 1 update: [pytest](https://github.com/pytest-dev/pytest). Updates `pytest` from 9.0.2 to 9.0.3 - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/9.0.2...9.0.3) --- updated-dependencies: - dependency-name: pytest dependency-version: 9.0.3 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: python-packages ... 
Signed-off-by: dependabot[bot] --- release-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release-requirements.txt b/release-requirements.txt index ddc1430..ad8bc5c 100644 --- a/release-requirements.txt +++ b/release-requirements.txt @@ -4,7 +4,7 @@ black==26.3.1 flake8==7.3.0 # Testing -pytest==9.0.2 +pytest==9.0.3 # Release & Publishing twine==6.2.0 From f4117990b29b8f50ad3c57c86c5af1f9700c1b9c Mon Sep 17 00:00:00 2001 From: Duncan Ogilvie Date: Sun, 26 Apr 2026 13:42:14 +0200 Subject: [PATCH 11/35] Add --token-from-gh authentication option --- CHANGES.rst | 5 +++ README.rst | 7 ++-- github_backup/github_backup.py | 48 +++++++++++++++++++++++-- tests/test_auth.py | 65 ++++++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 4 deletions(-) create mode 100644 tests/test_auth.py diff --git a/CHANGES.rst b/CHANGES.rst index 6041b9e..364bd3d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,11 @@ Changelog ========= +Unreleased +---------- +- Add ``--token-from-gh`` to read authentication from ``gh auth token``. + + 0.61.5 (2026-02-18) ------------------- ------------------------ diff --git a/README.rst b/README.rst index cd7be1f..030f260 100644 --- a/README.rst +++ b/README.rst @@ -36,8 +36,8 @@ Show the CLI help output:: CLI Help output:: - github-backup [-h] [-t TOKEN_CLASSIC] [-f TOKEN_FINE] [-q] [--as-app] - [-o OUTPUT_DIRECTORY] [-l LOG_LEVEL] [-i] + github-backup [-h] [-t TOKEN_CLASSIC] [-f TOKEN_FINE] [--token-from-gh] + [-q] [--as-app] [-o OUTPUT_DIRECTORY] [-l LOG_LEVEL] [-i] [--incremental-by-files] [--starred] [--all-starred] [--starred-skip-size-over MB] [--watched] [--followers] [--following] [--all] @@ -71,6 +71,7 @@ CLI Help output:: -f, --token-fine TOKEN_FINE fine-grained personal access token (github_pat_....), or path to token (file://...) + --token-from-gh read token from GitHub CLI (gh auth token) -q, --quiet supress log messages less severe than warning, e.g. 
info --as-app authenticate as github app instead of as a user. @@ -171,6 +172,8 @@ The positional argument ``USER`` specifies the user or organization account you **Classic tokens** (``-t TOKEN``) are `slightly less secure `_ as they provide very coarse-grained permissions. +If you already authenticate with the `GitHub CLI `_, you can use ``--token-from-gh`` to read the token with ``gh auth token`` instead of passing a token directly. This avoids placing the token in shell history or process arguments. When ``--github-host`` is set, the token is read with ``gh auth token --hostname HOST``. + Fine Tokens ~~~~~~~~~~~ diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 4d5394e..fd2fd99 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -167,6 +167,12 @@ def parse_args(args=None): dest="token_fine", help="fine-grained personal access token (github_pat_....), or path to token (file://...)", ) # noqa + parser.add_argument( + "--token-from-gh", + action="store_true", + dest="token_from_gh", + help="read token from GitHub CLI (gh auth token)", + ) parser.add_argument( "-q", "--quiet", @@ -537,8 +543,14 @@ def get_auth(args, encode=True, for_git_cli=False): raise Exception( "Fine-grained token supplied does not look like a GitHub PAT" ) - elif args.token_classic: - if args.token_classic.startswith(FILE_URI_PREFIX): + elif args.token_classic or args.token_from_gh: + if args.token_from_gh: + if args.as_app: + raise Exception( + "--token-from-gh cannot be used with --as-app; provide the app token with --token instead" + ) + args.token_classic = read_token_from_gh_cli(args) + elif args.token_classic.startswith(FILE_URI_PREFIX): args.token_classic = read_file_contents(args.token_classic) if not args.as_app: @@ -580,6 +592,38 @@ def read_file_contents(file_uri): return open(file_uri[len(FILE_URI_PREFIX) :], "rt").readline().strip() +def read_token_from_gh_cli(args): + cached_token = getattr(args, 
"_token_from_gh_value", None) + if cached_token: + return cached_token + + command = ["gh", "auth", "token"] + if args.github_host: + command.extend(["--hostname", get_github_host(args)]) + + try: + token = subprocess.check_output(command, stderr=subprocess.PIPE).decode( + "utf-8" + ).strip() + except FileNotFoundError: + raise Exception( + "Unable to read token from GitHub CLI: 'gh' executable not found" + ) + except subprocess.CalledProcessError as e: + stderr = e.stderr.decode("utf-8", errors="replace").strip() + if stderr: + raise Exception( + "Unable to read token from GitHub CLI: {0}".format(stderr) + ) + raise Exception("Unable to read token from GitHub CLI") + + if not token: + raise Exception("Unable to read token from GitHub CLI: token was empty") + + args._token_from_gh_value = token + return token + + def get_github_repo_url(args, repository): if repository.get("is_gist"): if args.prefer_ssh: diff --git a/tests/test_auth.py b/tests/test_auth.py new file mode 100644 index 0000000..504c822 --- /dev/null +++ b/tests/test_auth.py @@ -0,0 +1,65 @@ +"""Tests for authentication helpers.""" + +from unittest.mock import patch + +import pytest + +from github_backup import github_backup + + +def test_token_from_gh_flag_parses(): + args = github_backup.parse_args(["--token-from-gh", "testuser"]) + assert args.token_from_gh is True + + +def test_get_auth_reads_token_from_gh_cli(create_args): + args = create_args(token_from_gh=True) + + with patch( + "github_backup.github_backup.subprocess.check_output", + return_value=b"gho_test_token\n", + ) as mock_check_output: + auth = github_backup.get_auth(args, encode=False) + + assert auth == "gho_test_token:x-oauth-basic" + mock_check_output.assert_called_once_with( + ["gh", "auth", "token"], stderr=github_backup.subprocess.PIPE + ) + + +def test_get_auth_reads_token_from_gh_cli_for_enterprise_host(create_args): + args = create_args(token_from_gh=True, github_host="ghe.example.com") + + with patch( + 
"github_backup.github_backup.subprocess.check_output", + return_value=b"gho_enterprise_token\n", + ) as mock_check_output: + auth = github_backup.get_auth(args, encode=False) + + assert auth == "gho_enterprise_token:x-oauth-basic" + mock_check_output.assert_called_once_with( + ["gh", "auth", "token", "--hostname", "ghe.example.com"], + stderr=github_backup.subprocess.PIPE, + ) + + +def test_token_from_gh_is_cached(create_args): + args = create_args(token_from_gh=True) + + with patch( + "github_backup.github_backup.subprocess.check_output", + return_value=b"gho_cached_token\n", + ) as mock_check_output: + assert github_backup.get_auth(args, encode=False) == "gho_cached_token:x-oauth-basic" + assert github_backup.get_auth(args, encode=False) == "gho_cached_token:x-oauth-basic" + + mock_check_output.assert_called_once() + + +def test_token_from_gh_rejects_as_app(create_args): + args = create_args(token_from_gh=True, as_app=True) + + with pytest.raises(Exception) as exc_info: + github_backup.get_auth(args, encode=False) + + assert "--token-from-gh cannot be used with --as-app" in str(exc_info.value) From 4d022d94d0c7656a481651d8310a23e97a7db7fd Mon Sep 17 00:00:00 2001 From: Duncan Ogilvie Date: Sun, 26 Apr 2026 13:45:29 +0200 Subject: [PATCH 12/35] Add support for discussions Closes #290 --- CHANGES.rst | 2 + README.rst | 34 ++- github_backup/github_backup.py | 495 +++++++++++++++++++++++++++++-- github_backup/graphql_queries.py | 292 ++++++++++++++++++ tests/test_auth.py | 10 + tests/test_discussions.py | 222 ++++++++++++++ tests/test_retrieve_data.py | 28 ++ 7 files changed, 1042 insertions(+), 41 deletions(-) create mode 100644 github_backup/graphql_queries.py create mode 100644 tests/test_discussions.py diff --git a/CHANGES.rst b/CHANGES.rst index 364bd3d..50f8d54 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -3,6 +3,8 @@ Changelog Unreleased ---------- +- Add GitHub Discussions backups via GraphQL, including comments, replies, + optional attachment downloads, and 
per-repository incremental checkpoints. - Add ``--token-from-gh`` to read authentication from ``gh auth token``. diff --git a/README.rst b/README.rst index 030f260..4135743 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ github-backup |PyPI| |Python Versions| -The package can be used to backup an *entire* `Github `_ organization, repository or user account, including starred repos, issues and wikis in the most appropriate format (clones for wikis, json files for issues). +The package can be used to backup an *entire* `Github `_ organization, repository or user account, including starred repos, issues, discussions and wikis in the most appropriate format (clones for wikis, json files for issues and discussions). Requirements ============ @@ -44,8 +44,9 @@ CLI Help output:: [--issues] [--issue-comments] [--issue-events] [--pulls] [--pull-comments] [--pull-commits] [--pull-details] [--labels] [--hooks] [--milestones] [--security-advisories] - [--repositories] [--bare] [--no-prune] [--lfs] [--wikis] - [--gists] [--starred-gists] [--skip-archived] [--skip-existing] + [--discussions] [--repositories] [--bare] [--no-prune] + [--lfs] [--wikis] [--gists] [--starred-gists] + [--skip-archived] [--skip-existing] [-L [LANGUAGES ...]] [-N NAME_REGEX] [-H GITHUB_HOST] [-O] [-R REPOSITORY] [-P] [-F] [--prefer-ssh] [-v] [--keychain-name OSX_KEYCHAIN_ITEM_NAME] @@ -104,6 +105,7 @@ CLI Help output:: --milestones include milestones in backup --security-advisories include security advisories in backup + --discussions include discussions in backup --repositories include repository clone in backup --bare clone bare repositories --no-prune disable prune option for git fetch @@ -144,8 +146,8 @@ CLI Help output:: applies if including releases --skip-assets-on [SKIP_ASSETS_ON ...] 
skip asset downloads for these repositories - --attachments download user-attachments from issues and pull - requests + --attachments download user-attachments from issues, pull requests, + and discussions --throttle-limit THROTTLE_LIMIT start throttling of GitHub API requests after this amount of API requests remain @@ -184,7 +186,7 @@ Customise the permissions for your use case, but for a personal account full bac **User permissions**: Read access to followers, starring, and watching. -**Repository permissions**: Read access to contents, issues, metadata, pull requests, and webhooks. +**Repository permissions**: Read access to contents, discussions, issues, metadata, pull requests, and webhooks. GitHub Apps @@ -265,9 +267,9 @@ LFS objects are fetched for all refs, not just the current checkout, ensuring a About Attachments ----------------- -When you use the ``--attachments`` option with ``--issues`` or ``--pulls``, the tool will download user-uploaded attachments (images, videos, documents, etc.) from issue and pull request descriptions and comments. In some circumstances attachments contain valuable data related to the topic, and without their backup important information or context might be lost inadvertently. +When you use the ``--attachments`` option with ``--issues``, ``--pulls`` or ``--discussions``, the tool will download user-uploaded attachments (images, videos, documents, etc.) from issue, pull request and discussion descriptions and comments. In some circumstances attachments contain valuable data related to the topic, and without their backup important information or context might be lost inadvertently. -Attachments are saved to ``issues/attachments/{issue_number}/`` and ``pulls/attachments/{pull_number}/`` directories, where ``{issue_number}`` is the GitHub issue number (e.g., issue #123 saves to ``issues/attachments/123/``). 
Each attachment directory contains: +Attachments are saved to ``issues/attachments/{issue_number}/``, ``pulls/attachments/{pull_number}/`` and ``discussions/attachments/{discussion_number}/`` directories, where ``{issue_number}`` is the GitHub issue number (e.g., issue #123 saves to ``issues/attachments/123/``). Each attachment directory contains: - The downloaded attachment files (named by their GitHub identifier with appropriate file extensions) - If multiple attachments have the same filename, conflicts are resolved with numeric suffixes (e.g., ``report.pdf``, ``report_1.pdf``, ``report_2.pdf``) @@ -287,6 +289,16 @@ The tool automatically extracts file extensions from HTTP headers to ensure file **Fine-grained token limitation:** Due to a GitHub platform limitation, fine-grained personal access tokens (``github_pat_...``) cannot download attachments from private repositories directly. This affects both ``/assets/`` (images) and ``/files/`` (documents) URLs. The tool implements a workaround for image attachments using GitHub's Markdown API, which converts URLs to temporary JWT-signed URLs that can be downloaded. However, this workaround only works for images - document attachments (PDFs, text files, etc.) will fail with 404 errors when using fine-grained tokens on private repos. For full attachment support on private repositories, use a classic token (``-t``) instead of a fine-grained token (``-f``). See `#477 `_ for details. +About Discussions +----------------- + +GitHub Discussions are backed up with GitHub's GraphQL API because the REST API does not expose discussions. Use ``--discussions`` to save each discussion as JSON under ``repositories/{repo}/discussions/{number}.json``. Discussion backups include the discussion body and metadata, category information, comments, and comment replies. + +``--discussions`` is included in ``--all``. Unlike most REST API-backed resources, discussions require authentication because GitHub's GraphQL API requires a token. 
Fine-grained personal access tokens and GitHub Apps need read access to the repository's Discussions permission. + +Incremental backups use a per-repository checkpoint at ``repositories/{repo}/discussions/last_update`` based on discussion ``updatedAt`` timestamps. This is separate from the repository-level ``last_update`` file so discussion activity is not missed if the repository's own update timestamp does not change. If you enable ``--discussions`` on an existing incremental backup, the first run performs a full discussions backup for each repository and creates the discussions checkpoint for future runs. + + About security advisories ------------------------- @@ -419,14 +431,14 @@ Quietly and incrementally backup useful Github user data (public and private rep export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN GH_USER=YOUR-GITHUB-USER - github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --security-advisories --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER + github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --security-advisories --discussions --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER Debug an error/block or incomplete backup into a temporary directory. Omit "incremental" to fill a previous incomplete backup. 
:: export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN GH_USER=YOUR-GITHUB-USER - github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER + github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --discussions --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER Pipe a token from stdin to avoid storing it in environment variables or command history (Unix-like systems only):: @@ -442,7 +454,7 @@ This tool creates backups only, there is no inbuilt restore command. cd /tmp/white-house/repositories/petitions/repository git push --mirror git@github.com:WhiteHouse/petitions.git -**Issues, pull requests, comments, and other metadata** are saved as JSON files for archival purposes. The GitHub API does not support recreating this data faithfully, creating issues via the API has limitations: +**Issues, pull requests, discussions, comments, and other metadata** are saved as JSON files for archival purposes. 
The GitHub API does not support recreating this data faithfully, creating issues via the API has limitations: - New issue/PR numbers are assigned (original numbers cannot be set) - Timestamps reflect creation time (original dates cannot be set) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index fd2fd99..c1245bd 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -33,6 +33,13 @@ except ImportError: VERSION = "unknown" +from .graphql_queries import ( + DISCUSSION_DETAIL_QUERY, + DISCUSSION_LIST_QUERY, + DISCUSSION_PAGE_SIZE, + DISCUSSION_REPLIES_QUERY, +) + FNULL = open(os.devnull, "w") FILE_URI_PREFIX = "file://" logger = logging.getLogger(__name__) @@ -322,6 +329,12 @@ def parse_args(args=None): dest="include_security_advisories", help="include security advisories in backup", ) + parser.add_argument( + "--discussions", + action="store_true", + dest="include_discussions", + help="include discussions in backup", + ) parser.add_argument( "--repositories", action="store_true", @@ -469,7 +482,7 @@ def parse_args(args=None): "--attachments", action="store_true", dest="include_attachments", - help="download user-attachments from issues and pull requests", + help="download user-attachments from issues, pull requests, and discussions", ) parser.add_argument( "--throttle-limit", @@ -579,6 +592,31 @@ def get_github_api_host(args): return host +def get_github_graphql_url(args): + if args.github_host: + return "https://{0}/api/graphql".format(args.github_host) + + return "https://api.github.com/graphql" + + +def get_graphql_auth(args): + auth = get_auth(args, encode=False) + if not auth: + return None + + # GraphQL expects a bearer token. Classic tokens and keychain tokens use + # "token:x-oauth-basic" for REST Basic auth, so strip the synthetic + # password before sending the GraphQL Authorization header. 
+ if ( + not getattr(args, "as_app", False) + and getattr(args, "token_fine", None) is None + and ":" in auth + ): + auth = auth.split(":", 1)[0] + + return auth + + def get_github_host(args): if args.github_host: host = args.github_host @@ -810,6 +848,87 @@ def _extract_legal_url(response_body_bytes): return list(fetch_all()) +def retrieve_graphql_data(args, query, variables=None, log_context=None): + """Fetch data from GitHub's GraphQL API.""" + auth = get_graphql_auth(args) + if not auth: + raise Exception("GitHub GraphQL API requires authentication") + + variables = variables or {} + payload = json.dumps( + {"query": query, "variables": variables}, ensure_ascii=False + ).encode("utf-8") + endpoint = get_github_graphql_url(args) + + for attempt in range(args.max_retries + 1): + request = Request(endpoint, data=payload, method="POST") + request.add_header("Accept", "application/json") + request.add_header("Content-Type", "application/json") + request.add_header("Authorization", "bearer " + auth) + log_url = endpoint + if log_context: + log_url = "{0} ({1})".format(log_url, log_context) + logger.info("Requesting {0}".format(log_url)) + + http_response = make_request_with_retry(request, auth, args.max_retries) + + status = http_response.getcode() + if status != 200: + raise Exception( + f"Unexpected HTTP {status} from {endpoint} " + f"(expected non-2xx to raise HTTPError)" + ) + + try: + response = json.loads(http_response.read().decode("utf-8")) + except (IncompleteRead, json.decoder.JSONDecodeError, TimeoutError) as e: + logger.warning(f"{type(e).__name__} reading GraphQL response") + if attempt < args.max_retries: + delay = calculate_retry_delay(attempt, {}) + logger.warning( + f"Retrying GraphQL read in {delay:.1f}s " + f"(attempt {attempt + 1}/{args.max_retries + 1})" + ) + time.sleep(delay) + continue + raise Exception( + f"Failed to read GraphQL response after {args.max_retries + 1} " + f"attempts for {endpoint}" + ) + + if ( + remaining := 
int(http_response.headers.get("x-ratelimit-remaining", 0)) + ) <= (args.throttle_limit or 0): + if args.throttle_limit: + logger.info( + f"Throttling: {remaining} requests left, pausing {args.throttle_pause}s" + ) + time.sleep(args.throttle_pause) + + errors = response.get("errors") or [] + if errors: + if any(error.get("type") == "RATE_LIMITED" for error in errors): + if attempt < args.max_retries: + delay = calculate_retry_delay(attempt, http_response.headers) + logger.warning( + f"GraphQL rate limit hit, retrying in {delay:.1f}s " + f"(attempt {attempt + 1}/{args.max_retries + 1})" + ) + time.sleep(delay) + continue + + messages = "; ".join( + error.get("message", str(error)) for error in errors + ) + raise Exception("GraphQL Error: {0}".format(messages)) + + return response.get("data", {}) + + raise Exception( + f"GraphQL request failed after {args.max_retries + 1} attempts" + ) # pragma: no cover + + def make_request_with_retry(request, auth, max_retries=5): """Make HTTP request with automatic retry for transient errors.""" @@ -1193,7 +1312,7 @@ def get_jwt_signed_url_via_markdown_api(url, token, repo_context): def extract_attachment_urls(item_data, issue_number=None, repository_full_name=None): - """Extract GitHub-hosted attachment URLs from issue/PR body and comments. + """Extract GitHub-hosted attachment URLs from issue/PR/discussion body and comments. What qualifies as an attachment? 
There is no "attachment" concept in the GitHub API - it's a user behavior pattern @@ -1335,33 +1454,29 @@ def redirect_request(self, req, fp, code, msg, headers, newurl): # and exclude the URL to avoid downloading from wrong repos return False + def extract_from_text(text): + text_cleaned = remove_code_blocks(text or "") + for pattern in patterns: + found_urls = re.findall(pattern, text_cleaned) + urls.extend([clean_url(url) for url in found_urls]) + + def extract_from_comments(comments): + for comment in comments: + extract_from_text(comment.get("body") or "") + # GitHub Discussions support one level of replies. Issues and pull + # requests don't have reply_data, so this is a no-op for them. + extract_from_comments(comment.get("reply_data") or []) + # Extract from body - body = item_data.get("body") or "" - # Remove code blocks before searching for URLs - body_cleaned = remove_code_blocks(body) - for pattern in patterns: - found_urls = re.findall(pattern, body_cleaned) - urls.extend([clean_url(url) for url in found_urls]) - - # Extract from issue comments + extract_from_text(item_data.get("body") or "") + + # Extract from issue comments and discussion comments if "comment_data" in item_data: - for comment in item_data["comment_data"]: - comment_body = comment.get("body") or "" - # Remove code blocks before searching for URLs - comment_cleaned = remove_code_blocks(comment_body) - for pattern in patterns: - found_urls = re.findall(pattern, comment_cleaned) - urls.extend([clean_url(url) for url in found_urls]) + extract_from_comments(item_data["comment_data"]) # Extract from PR regular comments if "comment_regular_data" in item_data: - for comment in item_data["comment_regular_data"]: - comment_body = comment.get("body") or "" - # Remove code blocks before searching for URLs - comment_cleaned = remove_code_blocks(comment_body) - for pattern in patterns: - found_urls = re.findall(pattern, comment_cleaned) - urls.extend([clean_url(url) for url in found_urls]) + 
extract_from_comments(item_data["comment_regular_data"]) regex_urls = list(set(urls)) # dedupe @@ -1463,20 +1578,24 @@ def resolve_filename_collision(filepath): def download_attachments( args, item_cwd, item_data, number, repository, item_type="issue" ): - """Download user-attachments from issue/PR body and comments with manifest. + """Download user-attachments from issue/PR/discussion body and comments with manifest. Args: args: Command line arguments - item_cwd: Working directory (issue_cwd or pulls_cwd) - item_data: Issue or PR data dict - number: Issue or PR number + item_cwd: Working directory (issue_cwd, pulls_cwd, or discussion_cwd) + item_data: Issue, PR, or discussion data dict + number: Issue, PR, or discussion number repository: Repository dict - item_type: "issue" or "pull" for logging/manifest + item_type: "issue", "pull", or "discussion" for logging/manifest """ import json from datetime import datetime, timezone - item_type_display = "issue" if item_type == "issue" else "pull request" + item_type_display = { + "issue": "issue", + "pull": "pull request", + "discussion": "discussion", + }.get(item_type, item_type) urls = extract_attachment_urls( item_data, issue_number=number, repository_full_name=repository["full_name"] @@ -1621,6 +1740,8 @@ def download_attachments( # Write manifest if attachment_metadata_list: manifest = { + "item_number": number, + "item_type": item_type, "issue_number": number, "issue_type": item_type, "repository": ( @@ -1888,6 +2009,9 @@ def backup_repositories(args, output_directory, repositories): if args.include_pulls or args.include_everything: backup_pulls(args, repo_cwd, repository, repos_template) + if args.include_discussions or args.include_everything: + backup_discussions(args, repo_cwd, repository) + if args.include_milestones or args.include_everything: backup_milestones(args, repo_cwd, repository, repos_template) @@ -1922,6 +2046,317 @@ def backup_repositories(args, output_directory, repositories): 
open(last_update_path, "w").write(last_update) +def _repository_owner_name(repository): + return repository["full_name"].split("/", 1) + + +def _connection_nodes(connection): + return [node for node in (connection or {}).get("nodes") or [] if node] + + +def retrieve_discussion_summaries(args, repository, since=None): + owner, name = _repository_owner_name(repository) + after = None + page = 1 + summaries = [] + newest_seen = None + discussions_enabled = None + total_count = 0 + + while True: + data = retrieve_graphql_data( + args, + DISCUSSION_LIST_QUERY, + { + "owner": owner, + "name": name, + "after": after, + "pageSize": DISCUSSION_PAGE_SIZE, + }, + log_context="discussion summaries {0} page {1}".format( + repository["full_name"], page + ), + ) + repository_data = data.get("repository") + if repository_data is None: + raise Exception( + "Repository {0} not found in GraphQL response".format( + repository["full_name"] + ) + ) + + discussions_enabled = repository_data.get("hasDiscussionsEnabled") + if not discussions_enabled: + return [], None, False, 0 + + discussions = repository_data.get("discussions") or {} + total_count = discussions.get("totalCount", total_count) + stop = False + + for discussion in _connection_nodes(discussions): + updated_at = discussion.get("updatedAt") + if updated_at and (newest_seen is None or updated_at > newest_seen): + newest_seen = updated_at + + if since and updated_at and updated_at < since: + stop = True + break + + summaries.append(discussion) + + page_info = discussions.get("pageInfo") or {} + if stop or not page_info.get("hasNextPage"): + break + + after = page_info.get("endCursor") + page += 1 + + return summaries, newest_seen, discussions_enabled, total_count + + +def retrieve_discussion_comment_replies(args, comment_id, after=None, log_context=None): + data = retrieve_graphql_data( + args, + DISCUSSION_REPLIES_QUERY, + { + "commentId": comment_id, + "repliesCursor": after, + "pageSize": DISCUSSION_PAGE_SIZE, + }, + 
log_context=log_context, + ) + node = data.get("node") or {} + return node.get("replies") or {} + + +def _discussion_comment_log_identifier(comment_node): + return ( + comment_node.get("databaseId") + or comment_node.get("url") + or comment_node.get("id") + ) + + +def _discussion_comment_with_replies( + args, comment_node, repository_full_name=None, discussion_number=None +): + replies_connection = comment_node.get("replies") or {} + replies = _connection_nodes(replies_connection) + reply_total_count = replies_connection.get("totalCount", len(replies)) + page_info = replies_connection.get("pageInfo") or {} + reply_page = 2 + + while page_info.get("hasNextPage"): + log_context = None + if repository_full_name and discussion_number is not None: + log_context = "discussion {0}#{1} comment {2} replies page {3}".format( + repository_full_name, + discussion_number, + _discussion_comment_log_identifier(comment_node), + reply_page, + ) + + replies_connection = retrieve_discussion_comment_replies( + args, + comment_node["id"], + page_info.get("endCursor"), + log_context=log_context, + ) + replies.extend(_connection_nodes(replies_connection)) + page_info = replies_connection.get("pageInfo") or {} + reply_page += 1 + + comment = {key: value for key, value in comment_node.items() if key != "replies"} + comment["reply_count"] = reply_total_count + comment["reply_data"] = replies + return comment + + +def retrieve_discussion(args, repository, number): + owner, name = _repository_owner_name(repository) + comments_cursor = None + comments_page = 1 + discussion_data = None + comments = [] + comment_total_count = 0 + + while True: + data = retrieve_graphql_data( + args, + DISCUSSION_DETAIL_QUERY, + { + "owner": owner, + "name": name, + "number": number, + "commentsCursor": comments_cursor, + "pageSize": DISCUSSION_PAGE_SIZE, + }, + log_context="discussion {0}#{1} details/comments page {2}".format( + repository["full_name"], number, comments_page + ), + ) + repository_data = 
data.get("repository") or {} + discussion = repository_data.get("discussion") + if discussion is None: + raise Exception( + "Discussion #{0} not found in {1}".format( + number, repository["full_name"] + ) + ) + + if discussion_data is None: + discussion_data = { + key: value for key, value in discussion.items() if key != "comments" + } + + comments_connection = discussion.get("comments") or {} + comment_total_count = comments_connection.get( + "totalCount", comment_total_count + ) + for comment_node in _connection_nodes(comments_connection): + comments.append( + _discussion_comment_with_replies( + args, comment_node, repository["full_name"], number + ) + ) + + page_info = comments_connection.get("pageInfo") or {} + if not page_info.get("hasNextPage"): + break + + comments_cursor = page_info.get("endCursor") + comments_page += 1 + + discussion_data["comment_count"] = comment_total_count + discussion_data["comment_data"] = comments + return discussion_data + + +def backup_discussions(args, repo_cwd, repository): + discussion_cwd = os.path.join(repo_cwd, "discussions") + if args.skip_existing and os.path.isdir(discussion_cwd): + return + + if not get_graphql_auth(args): + logger.info( + "Skipping {0} discussions since GitHub GraphQL API requires authentication".format( + repository["full_name"] + ) + ) + return + + discussions_since = None + discussion_last_update_path = os.path.join(discussion_cwd, "last_update") + if args.incremental and os.path.exists(discussion_last_update_path): + discussions_since = open(discussion_last_update_path).read().strip() + + logger.info("Retrieving {0} discussions".format(repository["full_name"])) + try: + ( + summaries, + newest_seen, + discussions_enabled, + total_count, + ) = retrieve_discussion_summaries(args, repository, since=discussions_since) + except Exception as e: + logger.warning( + "Unable to retrieve discussions for {0}, skipping: {1}".format( + repository["full_name"], e + ) + ) + return + + if not discussions_enabled: + 
logger.info( + "Discussions are not enabled for {0}, skipping".format( + repository["full_name"] + ) + ) + return + + mkdir_p(repo_cwd, discussion_cwd) + + if discussions_since: + logger.info( + "Saving {0} updated discussions to disk ({1} total)".format( + len(summaries), total_count + ) + ) + else: + logger.info("Saving {0} discussions to disk".format(len(summaries))) + + written_count = 0 + skipped_count = 0 + had_errors = False + for summary in summaries: + number = summary["number"] + discussion_file = os.path.join(discussion_cwd, "{0}.json".format(number)) + + if args.incremental_by_files and os.path.isfile(discussion_file): + modified = os.path.getmtime(discussion_file) + modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ") + if modified > summary["updatedAt"]: + logger.info( + "Skipping discussion {0} because it wasn't modified since last backup".format( + number + ) + ) + skipped_count += 1 + continue + + try: + discussion = retrieve_discussion(args, repository, number) + except Exception as e: + logger.warning( + "Unable to retrieve discussion {0}#{1}, skipping: {2}".format( + repository["full_name"], number, e + ) + ) + had_errors = True + continue + + if args.include_attachments: + download_attachments( + args, + discussion_cwd, + discussion, + number, + repository, + item_type="discussion", + ) + + if json_dump_if_changed(discussion, discussion_file): + written_count += 1 + + if ( + args.incremental + and not had_errors + and newest_seen + and (not discussions_since or newest_seen > discussions_since) + ): + open(discussion_last_update_path, "w").write(newest_seen) + + attempted_count = len(summaries) - skipped_count + if not summaries: + logger.info("No discussions to save") + elif attempted_count == 0: + logger.info("{0} discussions skipped".format(skipped_count)) + elif written_count == attempted_count: + logger.info("Saved {0} discussions to disk".format(written_count)) + elif written_count == 0: + logger.info( + "{0} 
discussions unchanged, skipped write".format(attempted_count) + ) + else: + logger.info( + "Saved {0} discussions to disk ({1} unchanged, {2} skipped)".format( + written_count, + attempted_count - written_count, + skipped_count, + ) + ) + + def backup_issues(args, repo_cwd, repository, repos_template): has_issues_dir = os.path.isdir("{0}/issues/.git".format(repo_cwd)) if args.skip_existing and has_issues_dir: diff --git a/github_backup/graphql_queries.py b/github_backup/graphql_queries.py new file mode 100644 index 0000000..96eb552 --- /dev/null +++ b/github_backup/graphql_queries.py @@ -0,0 +1,292 @@ +"""GraphQL query templates used by github-backup.""" + +DISCUSSION_PAGE_SIZE = 100 + +DISCUSSION_LIST_QUERY = """ +query($owner: String!, $name: String!, $after: String, $pageSize: Int!) { + repository(owner: $owner, name: $name) { + hasDiscussionsEnabled + discussions( + first: $pageSize, + after: $after, + orderBy: {field: UPDATED_AT, direction: DESC} + ) { + totalCount + nodes { + id + number + title + updatedAt + } + pageInfo { + hasNextPage + endCursor + } + } + } +} +""" + +DISCUSSION_DETAIL_QUERY = """ +query( + $owner: String!, + $name: String!, + $number: Int!, + $commentsCursor: String, + $pageSize: Int! 
+) { + repository(owner: $owner, name: $name) { + discussion(number: $number) { + activeLockReason + answer { + id + databaseId + url + } + answerChosenAt + answerChosenBy { + ...ActorFields + } + author { + ...ActorFields + } + authorAssociation + body + bodyHTML + bodyText + category { + createdAt + description + emoji + emojiHTML + id + isAnswerable + name + slug + updatedAt + } + closed + closedAt + createdAt + createdViaEmail + databaseId + editor { + ...ActorFields + } + id + includesCreatedEdit + isAnswered + labels(first: 100) { + totalCount + nodes { + id + name + color + description + } + } + lastEditedAt + locked + number + poll { + id + question + totalVoteCount + options(first: 100) { + totalCount + nodes { + id + option + totalVoteCount + } + } + } + publishedAt + reactionGroups { + ...ReactionGroupFields + } + resourcePath + stateReason + title + updatedAt + upvoteCount + url + comments(first: $pageSize, after: $commentsCursor) { + totalCount + nodes { + ...DiscussionCommentFields + replies(first: $pageSize) { + totalCount + nodes { + ...DiscussionReplyFields + } + pageInfo { + hasNextPage + endCursor + } + } + } + pageInfo { + hasNextPage + endCursor + } + } + } + } +} + +fragment ActorFields on Actor { + avatarUrl + login + resourcePath + url +} + +fragment ReactionGroupFields on ReactionGroup { + content + reactors { + totalCount + } +} + +fragment DiscussionCommentFields on DiscussionComment { + author { + ...ActorFields + } + authorAssociation + body + bodyHTML + bodyText + createdAt + createdViaEmail + databaseId + deletedAt + editor { + ...ActorFields + } + id + includesCreatedEdit + isAnswer + isMinimized + lastEditedAt + minimizedReason + publishedAt + reactionGroups { + ...ReactionGroupFields + } + replyTo { + id + databaseId + url + } + resourcePath + updatedAt + upvoteCount + url +} + +fragment DiscussionReplyFields on DiscussionComment { + author { + ...ActorFields + } + authorAssociation + body + bodyHTML + bodyText + createdAt + 
createdViaEmail + databaseId + deletedAt + editor { + ...ActorFields + } + id + includesCreatedEdit + isAnswer + isMinimized + lastEditedAt + minimizedReason + publishedAt + reactionGroups { + ...ReactionGroupFields + } + replyTo { + id + databaseId + url + } + resourcePath + updatedAt + upvoteCount + url +} +""" + +DISCUSSION_REPLIES_QUERY = """ +query($commentId: ID!, $repliesCursor: String, $pageSize: Int!) { + node(id: $commentId) { + ... on DiscussionComment { + replies(first: $pageSize, after: $repliesCursor) { + totalCount + nodes { + ...DiscussionReplyFields + } + pageInfo { + hasNextPage + endCursor + } + } + } + } +} + +fragment ActorFields on Actor { + avatarUrl + login + resourcePath + url +} + +fragment ReactionGroupFields on ReactionGroup { + content + reactors { + totalCount + } +} + +fragment DiscussionReplyFields on DiscussionComment { + author { + ...ActorFields + } + authorAssociation + body + bodyHTML + bodyText + createdAt + createdViaEmail + databaseId + deletedAt + editor { + ...ActorFields + } + id + includesCreatedEdit + isAnswer + isMinimized + lastEditedAt + minimizedReason + publishedAt + reactionGroups { + ...ReactionGroupFields + } + replyTo { + id + databaseId + url + } + resourcePath + updatedAt + upvoteCount + url +} +""" diff --git a/tests/test_auth.py b/tests/test_auth.py index 504c822..0102878 100644 --- a/tests/test_auth.py +++ b/tests/test_auth.py @@ -56,6 +56,16 @@ def test_token_from_gh_is_cached(create_args): mock_check_output.assert_called_once() +def test_graphql_auth_strips_basic_auth_suffix_for_gh_cli_token(create_args): + args = create_args(token_from_gh=True) + + with patch( + "github_backup.github_backup.subprocess.check_output", + return_value=b"gho_graphql_token\n", + ): + assert github_backup.get_graphql_auth(args) == "gho_graphql_token" + + def test_token_from_gh_rejects_as_app(create_args): args = create_args(token_from_gh=True, as_app=True) diff --git a/tests/test_discussions.py b/tests/test_discussions.py new 
file mode 100644 index 0000000..89fd8dd --- /dev/null +++ b/tests/test_discussions.py @@ -0,0 +1,222 @@ +"""Tests for GitHub Discussions backup support.""" + +import json +import os +from unittest.mock import patch + +from github_backup import github_backup + + +def test_parse_args_discussions_flag(): + args = github_backup.parse_args(["--discussions", "testuser"]) + assert args.include_discussions is True + + +def test_retrieve_discussion_summaries_stops_at_incremental_since(create_args): + args = create_args() + repository = {"full_name": "owner/repo"} + + page = { + "repository": { + "hasDiscussionsEnabled": True, + "discussions": { + "totalCount": 3, + "nodes": [ + {"number": 3, "title": "new", "updatedAt": "2026-02-01T00:00:00Z"}, + {"number": 2, "title": "also new", "updatedAt": "2026-01-10T00:00:00Z"}, + {"number": 1, "title": "old", "updatedAt": "2025-12-01T00:00:00Z"}, + ], + "pageInfo": {"hasNextPage": True, "endCursor": "NEXT"}, + }, + } + } + + with patch( + "github_backup.github_backup.retrieve_graphql_data", return_value=page + ) as mock_retrieve: + summaries, newest, enabled, total = github_backup.retrieve_discussion_summaries( + args, repository, since="2026-01-01T00:00:00Z" + ) + + assert enabled is True + assert total == 3 + assert newest == "2026-02-01T00:00:00Z" + assert [item["number"] for item in summaries] == [3, 2] + # The old discussion stops pagination, so the next page is not requested. 
+ assert mock_retrieve.call_count == 1 + assert ( + mock_retrieve.call_args.kwargs["log_context"] + == "discussion summaries owner/repo page 1" + ) + + +def test_retrieve_discussion_summaries_disabled_discussions(create_args): + args = create_args() + repository = {"full_name": "owner/repo"} + + with patch( + "github_backup.github_backup.retrieve_graphql_data", + return_value={"repository": {"hasDiscussionsEnabled": False}}, + ): + summaries, newest, enabled, total = github_backup.retrieve_discussion_summaries( + args, repository + ) + + assert summaries == [] + assert newest is None + assert enabled is False + assert total == 0 + + +def _comment(comment_id, body, replies=None, replies_has_next=False): + replies = replies or [] + return { + "id": comment_id, + "body": body, + "replies": { + "totalCount": len(replies) + (1 if replies_has_next else 0), + "nodes": replies, + "pageInfo": { + "hasNextPage": replies_has_next, + "endCursor": "REPLIES2" if replies_has_next else None, + }, + }, + } + + +def _discussion_page(comment_nodes, has_next=False): + return { + "repository": { + "discussion": { + "number": 42, + "title": "Discussion title", + "updatedAt": "2026-02-01T00:00:00Z", + "comments": { + "totalCount": 2, + "nodes": comment_nodes, + "pageInfo": { + "hasNextPage": has_next, + "endCursor": "COMMENTS2" if has_next else None, + }, + }, + } + } + } + + +def test_retrieve_discussion_paginates_comments_and_replies(create_args): + args = create_args() + repository = {"full_name": "owner/repo"} + + reply_1 = {"id": "reply-1", "body": "first reply"} + reply_2 = {"id": "reply-2", "body": "second reply"} + comment_1 = _comment("comment-1", "first comment", [reply_1], replies_has_next=True) + comment_2 = _comment("comment-2", "second comment") + + responses = [ + _discussion_page([comment_1], has_next=True), + { + "node": { + "replies": { + "totalCount": 2, + "nodes": [reply_2], + "pageInfo": {"hasNextPage": False, "endCursor": None}, + } + } + }, + 
_discussion_page([comment_2], has_next=False), + ] + + with patch( + "github_backup.github_backup.retrieve_graphql_data", side_effect=responses + ) as mock_retrieve: + discussion = github_backup.retrieve_discussion(args, repository, 42) + + assert discussion["number"] == 42 + assert discussion["comment_count"] == 2 + assert len(discussion["comment_data"]) == 2 + assert discussion["comment_data"][0]["body"] == "first comment" + assert discussion["comment_data"][0]["reply_count"] == 2 + assert [r["body"] for r in discussion["comment_data"][0]["reply_data"]] == [ + "first reply", + "second reply", + ] + assert discussion["comment_data"][1]["body"] == "second comment" + assert mock_retrieve.call_count == 3 + assert [ + call.kwargs["log_context"] for call in mock_retrieve.call_args_list + ] == [ + "discussion owner/repo#42 details/comments page 1", + "discussion owner/repo#42 comment comment-1 replies page 2", + "discussion owner/repo#42 details/comments page 2", + ] + + +def test_backup_discussions_uses_incremental_checkpoint(create_args, tmp_path): + args = create_args(token_classic="fake_token", include_discussions=True, incremental=True) + repository = {"full_name": "owner/repo"} + discussions_dir = tmp_path / "discussions" + discussions_dir.mkdir() + (discussions_dir / "last_update").write_text("2026-01-01T00:00:00Z") + + def fake_summaries(passed_args, passed_repository, since=None): + assert passed_args is args + assert passed_repository == repository + assert since == "2026-01-01T00:00:00Z" + return ( + [{"number": 7, "title": "updated", "updatedAt": "2026-02-01T00:00:00Z"}], + "2026-02-01T00:00:00Z", + True, + 1, + ) + + with patch( + "github_backup.github_backup.retrieve_discussion_summaries", + side_effect=fake_summaries, + ), patch( + "github_backup.github_backup.retrieve_discussion", + return_value={"number": 7, "title": "updated"}, + ): + github_backup.backup_discussions(args, tmp_path, repository) + + with open(discussions_dir / "7.json", 
encoding="utf-8") as f: + assert json.load(f) == {"number": 7, "title": "updated"} + assert (discussions_dir / "last_update").read_text() == "2026-02-01T00:00:00Z" + + +def test_backup_discussions_does_not_advance_checkpoint_on_discussion_error( + create_args, tmp_path +): + args = create_args(token_classic="fake_token", include_discussions=True, incremental=True) + repository = {"full_name": "owner/repo"} + discussions_dir = tmp_path / "discussions" + discussions_dir.mkdir() + (discussions_dir / "last_update").write_text("2026-01-01T00:00:00Z") + + with patch( + "github_backup.github_backup.retrieve_discussion_summaries", + return_value=( + [{"number": 7, "title": "updated", "updatedAt": "2026-02-01T00:00:00Z"}], + "2026-02-01T00:00:00Z", + True, + 1, + ), + ), patch( + "github_backup.github_backup.retrieve_discussion", + side_effect=Exception("temporary GraphQL error"), + ): + github_backup.backup_discussions(args, tmp_path, repository) + + assert (discussions_dir / "last_update").read_text() == "2026-01-01T00:00:00Z" + assert not os.path.exists(discussions_dir / "7.json") + + +def test_backup_discussions_skips_without_auth(create_args, tmp_path): + args = create_args(include_discussions=True) + repository = {"full_name": "owner/repo"} + + with patch("github_backup.github_backup.retrieve_discussion_summaries") as mock_retrieve: + github_backup.backup_discussions(args, tmp_path, repository) + + assert not mock_retrieve.called + assert not os.path.exists(tmp_path / "discussions") diff --git a/tests/test_retrieve_data.py b/tests/test_retrieve_data.py index 014c309..51848ef 100644 --- a/tests/test_retrieve_data.py +++ b/tests/test_retrieve_data.py @@ -1,6 +1,7 @@ """Tests for retrieve_data function.""" import json +import logging import socket from unittest.mock import Mock, patch from urllib.error import HTTPError, URLError @@ -355,6 +356,33 @@ def mock_urlopen(*args, **kwargs): ) # 1 initial + 5 retries = 6 attempts +class TestRetrieveGraphqlDataLogging: + """Tests 
for GraphQL request logging.""" + + def test_logs_graphql_context(self, create_args, caplog): + args = create_args(token_classic="fake_token") + mock_response = Mock() + mock_response.getcode.return_value = 200 + mock_response.read.return_value = json.dumps({"data": {}}).encode("utf-8") + mock_response.headers = {"x-ratelimit-remaining": "5000"} + + caplog.set_level(logging.INFO, logger="github_backup.github_backup") + with patch( + "github_backup.github_backup.make_request_with_retry", + return_value=mock_response, + ): + github_backup.retrieve_graphql_data( + args, + "query { viewer { login } }", + log_context="discussion owner/repo#1", + ) + + assert ( + "Requesting https://api.github.com/graphql (discussion owner/repo#1)" + in caplog.text + ) + + class TestRetrieveDataThrottling: """Tests for throttling behavior in retrieve_data.""" From 24b3fdb4f34f85be090c335426e41403331e3ddf Mon Sep 17 00:00:00 2001 From: Duncan Ogilvie Date: Sun, 26 Apr 2026 14:08:42 +0200 Subject: [PATCH 13/35] Add support for pull request reviews Closes #124 --- CHANGES.rst | 2 + README.rst | 16 ++- github_backup/github_backup.py | 148 ++++++++++++++++++-- tests/test_pull_reviews.py | 237 +++++++++++++++++++++++++++++++++ 4 files changed, 388 insertions(+), 15 deletions(-) create mode 100644 tests/test_pull_reviews.py diff --git a/CHANGES.rst b/CHANGES.rst index 50f8d54..b790ce1 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -5,6 +5,8 @@ Unreleased ---------- - Add GitHub Discussions backups via GraphQL, including comments, replies, optional attachment downloads, and per-repository incremental checkpoints. +- Add pull request review backups with ``--pull-reviews`` and one-time + incremental backfill for existing backups. - Add ``--token-from-gh`` to read authentication from ``gh auth token``. 
diff --git a/README.rst b/README.rst index 4135743..52d7222 100644 --- a/README.rst +++ b/README.rst @@ -42,7 +42,8 @@ CLI Help output:: [--starred] [--all-starred] [--starred-skip-size-over MB] [--watched] [--followers] [--following] [--all] [--issues] [--issue-comments] [--issue-events] [--pulls] - [--pull-comments] [--pull-commits] [--pull-details] + [--pull-comments] [--pull-reviews] [--pull-commits] + [--pull-details] [--labels] [--hooks] [--milestones] [--security-advisories] [--discussions] [--repositories] [--bare] [--no-prune] [--lfs] [--wikis] [--gists] [--starred-gists] @@ -97,6 +98,7 @@ CLI Help output:: --issue-events include issue events in backup --pulls include pull requests in backup --pull-comments include pull request review comments in backup + --pull-reviews include pull request reviews in backup --pull-commits include pull request commits in backup --pull-details include more pull request details in backup [*] --labels include labels in backup @@ -340,6 +342,14 @@ For finer control, avoid using ``--assets`` with starred repos, or use ``--skip- Alternatively, consider just storing links to starred repos in JSON format with ``--starred``. +About pull request reviews +-------------------------- + +Use ``--pull-reviews`` with ``--pulls`` to include GitHub pull request review metadata under each pull request's ``review_data`` key. Reviews are separate from review comments: ``--pull-comments`` backs up inline review comments via ``comment_data`` and regular PR conversation comments via ``comment_regular_data``, while ``--pull-reviews`` backs up review state, submitted time, commit ID, and the top-level review body. + +``--pull-reviews`` is included in ``--all``. Incremental backups use a per-repository checkpoint at ``repositories/{repo}/pulls/reviews_last_update``. 
If ``--pull-reviews`` is enabled on an existing incremental backup, the first run performs a one-time backfill for pull request reviews so older PRs are not skipped by the existing repository checkpoint. Existing ``comment_data``, ``comment_regular_data`` and ``commit_data`` fields are preserved when only review data is being added. + + Incremental Backup ------------------ @@ -431,14 +441,14 @@ Quietly and incrementally backup useful Github user data (public and private rep export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN GH_USER=YOUR-GITHUB-USER - github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --security-advisories --discussions --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER + github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-reviews --pull-commits --labels --milestones --security-advisories --discussions --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER Debug an error/block or incomplete backup into a temporary directory. Omit "incremental" to fill a previous incomplete backup. 
:: export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN GH_USER=YOUR-GITHUB-USER - github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --discussions --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER + github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-reviews --pull-commits --labels --milestones --discussions --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER Pipe a token from stdin to avoid storing it in environment variables or command history (Unix-like systems only):: diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index c1245bd..054d0c6 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -293,6 +293,12 @@ def parse_args(args=None): dest="include_pull_comments", help="include pull request review comments in backup", ) + parser.add_argument( + "--pull-reviews", + action="store_true", + dest="include_pull_reviews", + help="include pull request reviews in backup", + ) parser.add_argument( "--pull-commits", action="store_true", @@ -2427,6 +2433,57 @@ def backup_issues(args, repo_cwd, repository, repos_template): os.replace(issue_file + ".temp", issue_file) # Atomic write +PULL_OPTIONAL_DATA_KEYS = ( + "comment_regular_data", + "comment_data", + "commit_data", + "review_data", +) +PULL_REVIEWS_LAST_UPDATE_FILENAME = "reviews_last_update" + + +def read_json_file_if_exists(path): + if not os.path.isfile(path): + return None + + try: + with codecs.open(path, "r", encoding="utf-8") as f: + return json.load(f) + except (OSError, UnicodeDecodeError, json.decoder.JSONDecodeError) as e: + logger.debug("Error reading 
existing JSON file {0}: {1}".format(path, e)) + return None + + +def restore_existing_pull_optional_data(pull, existing_pull): + if not existing_pull: + return + + for key in PULL_OPTIONAL_DATA_KEYS: + if key not in pull and key in existing_pull: + pull[key] = existing_pull[key] + + +def get_pull_reviews_since(args, pulls_cwd): + args_since = getattr(args, "since", None) + if not args.incremental: + return args_since, None, None + + reviews_last_update_path = os.path.join( + pulls_cwd, PULL_REVIEWS_LAST_UPDATE_FILENAME + ) + if not os.path.exists(reviews_last_update_path): + # One-time backfill for existing incremental backups: if the user adds + # --pull-reviews after a repository checkpoint already exists, the + # repository-level checkpoint would otherwise skip old PRs forever. + return None, None, reviews_last_update_path + + reviews_since = open(reviews_last_update_path).read().strip() + if args_since and reviews_since: + return min(args_since, reviews_since), reviews_since, reviews_last_update_path + + return args_since or reviews_since, reviews_since, reviews_last_update_path + + def backup_pulls(args, repo_cwd, repository, repos_template): has_pulls_dir = os.path.isdir("{0}/pulls/.git".format(repo_cwd)) if args.skip_existing and has_pulls_dir: @@ -2436,7 +2493,20 @@ def backup_pulls(args, repo_cwd, repository, repos_template): pulls_cwd = os.path.join(repo_cwd, "pulls") mkdir_p(repo_cwd, pulls_cwd) + include_pull_reviews = args.include_pull_reviews or args.include_everything + repository_since = getattr(args, "since", None) + pulls_since = repository_since + pull_reviews_since = None + pull_reviews_last_update_path = None + if include_pull_reviews: + ( + pulls_since, + pull_reviews_since, + pull_reviews_last_update_path, + ) = get_pull_reviews_since(args, pulls_cwd) + pulls = {} + newest_pull_update = None _pulls_template = "{0}/{1}/pulls".format(repos_template, repository["full_name"]) _issue_template = "{0}/{1}/issues".format(repos_template, 
repository["full_name"]) query_args = { @@ -2446,27 +2516,43 @@ def backup_pulls(args, repo_cwd, repository, repos_template): "direction": "desc", } + def track_newest_pull_update(pull): + nonlocal newest_pull_update + updated_at = pull.get("updated_at") + if updated_at and ( + newest_pull_update is None or updated_at > newest_pull_update + ): + newest_pull_update = updated_at + + def pull_is_due_for_repository_checkpoint(pull): + return not repository_since or pull["updated_at"] >= repository_since + if not args.include_pull_details: pull_states = ["open", "closed"] for pull_state in pull_states: query_args["state"] = pull_state _pulls = retrieve_data(args, _pulls_template, query_args=query_args) for pull in _pulls: - if args.since and pull["updated_at"] < args.since: + track_newest_pull_update(pull) + if pulls_since and pull["updated_at"] < pulls_since: break - if not args.since or pull["updated_at"] >= args.since: + if not pulls_since or pull["updated_at"] >= pulls_since: pulls[pull["number"]] = pull else: _pulls = retrieve_data(args, _pulls_template, query_args=query_args) for pull in _pulls: - if args.since and pull["updated_at"] < args.since: + track_newest_pull_update(pull) + if pulls_since and pull["updated_at"] < pulls_since: break - if not args.since or pull["updated_at"] >= args.since: - pulls[pull["number"]] = retrieve_data( - args, - _pulls_template + "/{}".format(pull["number"]), - paginated=False, - )[0] + if not pulls_since or pull["updated_at"] >= pulls_since: + if pull_is_due_for_repository_checkpoint(pull): + pulls[pull["number"]] = retrieve_data( + args, + _pulls_template + "/{}".format(pull["number"]), + paginated=False, + )[0] + else: + pulls[pull["number"]] = pull logger.info("Saving {0} pull requests to disk".format(len(list(pulls.keys())))) # Comments from pulls API are only _review_ comments @@ -2476,24 +2562,50 @@ def backup_pulls(args, repo_cwd, repository, repos_template): comments_regular_template = _issue_template + "/{0}/comments" 
comments_template = _pulls_template + "/{0}/comments" commits_template = _pulls_template + "/{0}/commits" + reviews_template = _pulls_template + "/{0}/reviews" + pull_review_errors = False + for number, pull in list(pulls.items()): pull_file = "{0}/{1}.json".format(pulls_cwd, number) + existing_pull = read_json_file_if_exists(pull_file) + needs_review_backfill = ( + include_pull_reviews + and (not existing_pull or "review_data" not in existing_pull) + ) + if args.incremental_by_files and os.path.isfile(pull_file): modified = os.path.getmtime(pull_file) modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ") - if modified > pull["updated_at"]: + if modified > pull["updated_at"] and not needs_review_backfill: logger.info( "Skipping pull request {0} because it wasn't modified since last backup".format( number ) ) continue - if args.include_pull_comments or args.include_everything: + + should_fetch_non_review_data = pull_is_due_for_repository_checkpoint(pull) + if ( + args.include_pull_comments or args.include_everything + ) and should_fetch_non_review_data: template = comments_regular_template.format(number) pulls[number]["comment_regular_data"] = retrieve_data(args, template) template = comments_template.format(number) pulls[number]["comment_data"] = retrieve_data(args, template) - if args.include_pull_commits or args.include_everything: + if include_pull_reviews: + template = reviews_template.format(number) + try: + pulls[number]["review_data"] = retrieve_data(args, template) + except Exception as e: + pull_review_errors = True + logger.warning( + "Unable to retrieve reviews for pull request {0}#{1}, skipping reviews: {2}".format( + repository["full_name"], number, e + ) + ) + if ( + args.include_pull_commits or args.include_everything + ) and should_fetch_non_review_data: template = commits_template.format(number) pulls[number]["commit_data"] = retrieve_data(args, template) if args.include_attachments: @@ -2501,10 +2613,22 @@ def 
backup_pulls(args, repo_cwd, repository, repos_template): args, pulls_cwd, pulls[number], number, repository, item_type="pull" ) + restore_existing_pull_optional_data(pull, existing_pull) + with codecs.open(pull_file + ".temp", "w", encoding="utf-8") as f: json_dump(pull, f) os.replace(pull_file + ".temp", pull_file) # Atomic write + if ( + include_pull_reviews + and args.incremental + and pull_reviews_last_update_path + and newest_pull_update + and not pull_review_errors + and (not pull_reviews_since or newest_pull_update > pull_reviews_since) + ): + open(pull_reviews_last_update_path, "w").write(newest_pull_update) + def backup_milestones(args, repo_cwd, repository, repos_template): milestone_cwd = os.path.join(repo_cwd, "milestones") diff --git a/tests/test_pull_reviews.py b/tests/test_pull_reviews.py new file mode 100644 index 0000000..6130269 --- /dev/null +++ b/tests/test_pull_reviews.py @@ -0,0 +1,237 @@ +"""Tests for pull request review backups.""" + +import json +import os + +from github_backup import github_backup + + +def test_parse_args_pull_reviews_flag(): + args = github_backup.parse_args(["--pull-reviews", "testuser"]) + assert args.include_pull_reviews is True + + +def test_backup_pulls_includes_review_data(create_args, tmp_path, monkeypatch): + args = create_args(include_pulls=True, include_pull_reviews=True) + repository = {"full_name": "owner/repo"} + calls = [] + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + calls.append((template, query_args)) + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + "updated_at": "2026-02-01T00:00:00Z", + "title": "Add feature", + } + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + return [ + { + "id": 123, + "state": "APPROVED", + "body": "Looks good", + "submitted_at": "2026-02-01T00:00:00Z", + } + ] + raise AssertionError("Unexpected template: 
{0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + with open(tmp_path / "pulls" / "1.json", encoding="utf-8") as f: + pull = json.load(f) + + assert pull["review_data"] == [ + { + "body": "Looks good", + "id": 123, + "state": "APPROVED", + "submitted_at": "2026-02-01T00:00:00Z", + } + ] + assert ( + "https://api.github.com/repos/owner/repo/pulls/1/reviews", + None, + ) in calls + + +def test_pull_reviews_backfill_ignores_repository_checkpoint( + create_args, tmp_path, monkeypatch +): + args = create_args( + include_pulls=True, + include_pull_reviews=True, + incremental=True, + ) + args.since = "2026-01-01T00:00:00Z" + repository = {"full_name": "owner/repo"} + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + "updated_at": "2025-01-01T00:00:00Z", + "title": "Old pull request", + } + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + return [{"id": 123, "state": "APPROVED"}] + raise AssertionError("Unexpected template: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + with open(tmp_path / "pulls" / "1.json", encoding="utf-8") as f: + pull = json.load(f) + + assert pull["review_data"] == [{"id": 123, "state": "APPROVED"}] + assert (tmp_path / "pulls" / "reviews_last_update").read_text() == ( + "2025-01-01T00:00:00Z" + ) + + +def test_pull_reviews_uses_review_checkpoint_when_older_than_repository_checkpoint( + create_args, tmp_path, monkeypatch +): + args = create_args( + include_pulls=True, + include_pull_reviews=True, + incremental=True, + ) + args.since = 
"2026-01-01T00:00:00Z" + repository = {"full_name": "owner/repo"} + pulls_dir = tmp_path / "pulls" + pulls_dir.mkdir() + (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z") + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + "updated_at": "2025-06-01T00:00:00Z", + "title": "Review changed while feature was disabled", + }, + { + "number": 2, + "updated_at": "2024-12-01T00:00:00Z", + "title": "Too old", + }, + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + return [{"id": 123, "state": "COMMENTED"}] + raise AssertionError("Unexpected template: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + assert os.path.exists(tmp_path / "pulls" / "1.json") + assert not os.path.exists(tmp_path / "pulls" / "2.json") + assert (tmp_path / "pulls" / "reviews_last_update").read_text() == ( + "2025-06-01T00:00:00Z" + ) + + +def test_pull_reviews_preserves_existing_optional_pull_data( + create_args, tmp_path, monkeypatch +): + args = create_args(include_pulls=True, include_pull_reviews=True) + repository = {"full_name": "owner/repo"} + pulls_dir = tmp_path / "pulls" + pulls_dir.mkdir() + with open(pulls_dir / "1.json", "w", encoding="utf-8") as f: + json.dump( + { + "number": 1, + "updated_at": "2026-01-01T00:00:00Z", + "comment_data": [{"id": 10, "body": "inline comment"}], + "comment_regular_data": [{"id": 11, "body": "regular comment"}], + "commit_data": [{"sha": "abc"}], + }, + f, + ) + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + 
"updated_at": "2026-02-01T00:00:00Z", + "title": "Add reviews", + } + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + return [{"id": 123, "state": "APPROVED"}] + raise AssertionError("Unexpected template: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + with open(pulls_dir / "1.json", encoding="utf-8") as f: + pull = json.load(f) + + assert pull["review_data"] == [{"id": 123, "state": "APPROVED"}] + assert pull["comment_data"] == [{"id": 10, "body": "inline comment"}] + assert pull["comment_regular_data"] == [{"id": 11, "body": "regular comment"}] + assert pull["commit_data"] == [{"sha": "abc"}] + + +def test_pull_reviews_does_not_advance_checkpoint_on_review_error( + create_args, tmp_path, monkeypatch +): + args = create_args( + include_pulls=True, + include_pull_reviews=True, + incremental=True, + ) + args.since = "2026-01-01T00:00:00Z" + repository = {"full_name": "owner/repo"} + pulls_dir = tmp_path / "pulls" + pulls_dir.mkdir() + (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z") + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + "updated_at": "2025-06-01T00:00:00Z", + "title": "Review retrieval fails", + } + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + raise Exception("temporary API failure") + raise AssertionError("Unexpected template: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + assert (pulls_dir / "reviews_last_update").read_text() == "2025-01-01T00:00:00Z" From 
b3a8241c9ab5930acfae2014d6a48a4feabe95ae Mon Sep 17 00:00:00 2001 From: Duncan Ogilvie Date: Sun, 26 Apr 2026 15:03:48 +0200 Subject: [PATCH 14/35] Implement per-resource last_update timestamps Closes #62 --- CHANGES.rst | 5 + README.rst | 12 +- github_backup/github_backup.py | 167 +++++++++++++++++--- tests/test_incremental_per_repository.py | 189 +++++++++++++++++++++++ 4 files changed, 348 insertions(+), 25 deletions(-) create mode 100644 tests/test_incremental_per_repository.py diff --git a/CHANGES.rst b/CHANGES.rst index b790ce1..6cf9f17 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -7,6 +7,11 @@ Unreleased optional attachment downloads, and per-repository incremental checkpoints. - Add pull request review backups with ``--pull-reviews`` and one-time incremental backfill for existing backups. +- Store incremental ``last_update`` checkpoints per repository resource instead + of using one global checkpoint for the whole output directory. Existing + backups use the legacy global checkpoint as a migration fallback, and the + legacy file is removed once existing issue/pull backups have resource + checkpoints (#62). - Add ``--token-from-gh`` to read authentication from ``gh auth token``. diff --git a/README.rst b/README.rst index 52d7222..3a4be3b 100644 --- a/README.rst +++ b/README.rst @@ -347,15 +347,19 @@ About pull request reviews Use ``--pull-reviews`` with ``--pulls`` to include GitHub pull request review metadata under each pull request's ``review_data`` key. Reviews are separate from review comments: ``--pull-comments`` backs up inline review comments via ``comment_data`` and regular PR conversation comments via ``comment_regular_data``, while ``--pull-reviews`` backs up review state, submitted time, commit ID, and the top-level review body. -``--pull-reviews`` is included in ``--all``. Incremental backups use a per-repository checkpoint at ``repositories/{repo}/pulls/reviews_last_update``. 
If ``--pull-reviews`` is enabled on an existing incremental backup, the first run performs a one-time backfill for pull request reviews so older PRs are not skipped by the existing repository checkpoint. Existing ``comment_data``, ``comment_regular_data`` and ``commit_data`` fields are preserved when only review data is being added. +``--pull-reviews`` is included in ``--all``. Incremental backups use a per-repository checkpoint at ``repositories/{repo}/pulls/reviews_last_update``. If ``--pull-reviews`` is enabled on an existing incremental backup, the first run performs a one-time backfill for pull request reviews so older PRs are not skipped by the existing pull request checkpoint. Existing ``comment_data``, ``comment_regular_data`` and ``commit_data`` fields are preserved when only review data is being added. Incremental Backup ------------------ -Using (``-i, --incremental``) will only request new data from the API **since the last run (successful or not)**. e.g. only request issues from the API since the last run. +Using (``-i, --incremental``) will only request new data from the API **since the last successful resource backup**. e.g. only request issues from the API since the last issue backup for that repository. -This means any blocking errors on previous runs can cause a large amount of missing data in backups. +Incremental checkpoints for issue and pull request API backups are stored per resource in that repository's backup directory (for example ``repositories/{repo}/issues/last_update``, ``repositories/{repo}/pulls/last_update`` or ``starred/{owner}/{repo}/pulls/last_update``). Older versions stored a single global ``last_update`` file in the output directory root. During migration, the legacy global checkpoint is used as a fallback only for resource directories that already contain backup data but do not yet have their own checkpoint. 
New repositories or newly enabled resources with no existing data get a full backup instead of inheriting an unrelated global checkpoint. + +After all existing issue and pull request resource directories have per-resource checkpoints, the legacy global ``last_update`` file is removed automatically. + +This means any blocking errors on previous runs can cause missing data in backups for the affected repository resource. Using (``--incremental-by-files``) will request new data from the API **based on when the file was modified on filesystem**. e.g. if you modify the file yourself you may miss something. @@ -368,7 +372,7 @@ Known blocking errors Some errors will block the backup run by exiting the script. e.g. receiving a 403 Forbidden error from the Github API. -If the incremental argument is used, this will result in the next backup only requesting API data since the last blocked/failed run. Potentially causing unexpected large amounts of missing data. +If the incremental argument is used, per-resource checkpoints are only advanced after that resource's backup work completes. A blocking error can still abort the overall run, but repositories and resources that were not processed will keep their previous checkpoints. It's therefore recommended to only use the incremental argument if the output/result is being actively monitored, or complimented with periodic full non-incremental runs, to avoid unexpected missing data in a regular backup runs. 
diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 054d0c6..e56bb28 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -1928,26 +1928,138 @@ def filter_repositories(args, unfiltered_repositories): return repositories +INCREMENTAL_LAST_UPDATE_FILENAME = "last_update" +INCREMENTAL_RESOURCE_DIRECTORIES = ("issues", "pulls") + + +def get_repository_checkpoint_time(repository): + timestamps = [ + timestamp + for timestamp in (repository.get("updated_at"), repository.get("pushed_at")) + if timestamp + ] + if timestamps: + return max(timestamps) + + return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime()) + + +def resource_backup_exists(resource_cwd): + if not os.path.isdir(resource_cwd): + return False + + ignored_names = { + INCREMENTAL_LAST_UPDATE_FILENAME, + PULL_REVIEWS_LAST_UPDATE_FILENAME, + } + for name in os.listdir(resource_cwd): + if name in ignored_names or name.endswith(".temp"): + continue + return True + + return False + + +def read_legacy_last_update(args, output_directory): + if not args.incremental: + return None, None + + last_update_path = os.path.join(output_directory, INCREMENTAL_LAST_UPDATE_FILENAME) + if os.path.exists(last_update_path): + return last_update_path, open(last_update_path).read().strip() + + return last_update_path, None + + +def read_resource_last_update(args, resource_cwd, legacy_last_update=None): + if not args.incremental: + return None + + last_update_path = os.path.join(resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME) + if os.path.exists(last_update_path): + return open(last_update_path).read().strip() + + if legacy_last_update and resource_backup_exists(resource_cwd): + return legacy_last_update + + return None + + +def write_resource_last_update(args, resource_cwd, repository): + if not args.incremental: + return + + mkdir_p(resource_cwd) + last_update_path = os.path.join(resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME) + open(last_update_path, 
"w").write(get_repository_checkpoint_time(repository)) + + +def iter_incremental_resource_dirs(output_directory): + repositories_dir = os.path.join(output_directory, "repositories") + if os.path.isdir(repositories_dir): + for repository_name in os.listdir(repositories_dir): + repo_cwd = os.path.join(repositories_dir, repository_name) + if not os.path.isdir(repo_cwd): + continue + for resource_name in INCREMENTAL_RESOURCE_DIRECTORIES: + yield os.path.join(repo_cwd, resource_name) + + starred_dir = os.path.join(output_directory, "starred") + if os.path.isdir(starred_dir): + for owner_name in os.listdir(starred_dir): + owner_cwd = os.path.join(starred_dir, owner_name) + if not os.path.isdir(owner_cwd): + continue + for repository_name in os.listdir(owner_cwd): + repo_cwd = os.path.join(owner_cwd, repository_name) + if not os.path.isdir(repo_cwd): + continue + for resource_name in INCREMENTAL_RESOURCE_DIRECTORIES: + yield os.path.join(repo_cwd, resource_name) + + +def has_unmigrated_incremental_resources(output_directory): + for resource_cwd in iter_incremental_resource_dirs(output_directory): + last_update_path = os.path.join( + resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME + ) + if resource_backup_exists(resource_cwd) and not os.path.exists( + last_update_path + ): + return True + + return False + + +def remove_legacy_last_update_if_migrated( + args, output_directory, legacy_last_update_path +): + if not args.incremental or not legacy_last_update_path: + return + if not os.path.exists(legacy_last_update_path): + return + if has_unmigrated_incremental_resources(output_directory): + logger.info( + "Keeping legacy global last_update until all existing issue/pull " + "backups have per-resource checkpoints" + ) + return + + os.remove(legacy_last_update_path) + logger.info( + "Removed legacy global last_update after migrating incremental checkpoints" + ) + + def backup_repositories(args, output_directory, repositories): logger.info("Backing up repositories") 
repos_template = "https://{0}/repos".format(get_github_api_host(args)) + legacy_last_update_path, legacy_last_update = read_legacy_last_update( + args, output_directory + ) + incremental_resource_work_attempted = False - if args.incremental: - last_update_path = os.path.join(output_directory, "last_update") - if os.path.exists(last_update_path): - args.since = open(last_update_path).read().strip() - else: - args.since = None - else: - args.since = None - - last_update = "0000-00-00T00:00:00Z" for repository in repositories: - if repository.get("updated_at") and repository["updated_at"] > last_update: - last_update = repository["updated_at"] - elif repository.get("pushed_at") and repository["pushed_at"] > last_update: - last_update = repository["pushed_at"] - if repository.get("is_gist"): repo_cwd = os.path.join(output_directory, "gists", repository["id"]) elif repository.get("is_starred"): @@ -2010,10 +2122,22 @@ def backup_repositories(args, output_directory, repositories): no_prune=args.no_prune, ) if args.include_issues or args.include_everything: + incremental_resource_work_attempted = True + issue_cwd = os.path.join(repo_cwd, "issues") + args.since = read_resource_last_update( + args, issue_cwd, legacy_last_update + ) backup_issues(args, repo_cwd, repository, repos_template) + write_resource_last_update(args, issue_cwd, repository) if args.include_pulls or args.include_everything: + incremental_resource_work_attempted = True + pulls_cwd = os.path.join(repo_cwd, "pulls") + args.since = read_resource_last_update( + args, pulls_cwd, legacy_last_update + ) backup_pulls(args, repo_cwd, repository, repos_template) + write_resource_last_update(args, pulls_cwd, repository) if args.include_discussions or args.include_everything: backup_discussions(args, repo_cwd, repository) @@ -2021,7 +2145,9 @@ def backup_repositories(args, output_directory, repositories): if args.include_milestones or args.include_everything: backup_milestones(args, repo_cwd, repository, 
repos_template) - if args.include_security_advisories or (args.include_everything and not repository.get("private", False)): + if args.include_security_advisories or ( + args.include_everything and not repository.get("private", False) + ): backup_security_advisories(args, repo_cwd, repository, repos_template) if args.include_labels or args.include_everything: @@ -2045,11 +2171,10 @@ def backup_repositories(args, output_directory, repositories): logger.info(f"Skipping remaining resources for {repository['full_name']}") continue - if args.incremental: - if last_update == "0000-00-00T00:00:00Z": - last_update = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime()) - - open(last_update_path, "w").write(last_update) + if incremental_resource_work_attempted: + remove_legacy_last_update_if_migrated( + args, output_directory, legacy_last_update_path + ) def _repository_owner_name(repository): diff --git a/tests/test_incremental_per_repository.py b/tests/test_incremental_per_repository.py new file mode 100644 index 0000000..f1fd67a --- /dev/null +++ b/tests/test_incremental_per_repository.py @@ -0,0 +1,189 @@ +"""Tests for per-resource incremental checkpoints.""" + +import json +import os + +from github_backup import github_backup + + +def _repo(name, updated_at, pushed_at=None): + return { + "name": name, + "full_name": "owner/{0}".format(name), + "owner": {"login": "owner"}, + "clone_url": "https://github.com/owner/{0}.git".format(name), + "private": False, + "fork": False, + "has_wiki": False, + "updated_at": updated_at, + "pushed_at": pushed_at, + } + + +def test_incremental_uses_per_resource_last_update( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repositories = [ + _repo("repo-one", "2026-02-01T00:00:00Z"), + _repo("repo-two", "2026-03-01T00:00:00Z"), + ] + repo_one_issues = tmp_path / "repositories" / "repo-one" / "issues" + repo_two_issues = tmp_path / "repositories" / "repo-two" / "issues" + 
repo_one_issues.mkdir(parents=True) + repo_two_issues.mkdir(parents=True) + (repo_one_issues / "last_update").write_text("2026-01-01T00:00:00Z") + (repo_two_issues / "last_update").write_text("2025-01-01T00:00:00Z") + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append((repository["name"], passed_args.since)) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, repositories) + + assert seen_since == [ + ("repo-one", "2026-01-01T00:00:00Z"), + ("repo-two", "2025-01-01T00:00:00Z"), + ] + assert (repo_one_issues / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert (repo_two_issues / "last_update").read_text() == "2026-03-01T00:00:00Z" + assert not os.path.exists(tmp_path / "last_update") + + +def test_incremental_uses_independent_issue_and_pull_checkpoints( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True, include_pulls=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + repo_dir = tmp_path / "repositories" / "repo-one" + issues_dir = repo_dir / "issues" + pulls_dir = repo_dir / "pulls" + issues_dir.mkdir(parents=True) + pulls_dir.mkdir(parents=True) + (issues_dir / "last_update").write_text("2026-01-01T00:00:00Z") + (pulls_dir / "last_update").write_text("2025-01-01T00:00:00Z") + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append(("issues", passed_args.since)) + + def fake_backup_pulls(passed_args, repo_cwd, repository, repos_template): + seen_since.append(("pulls", passed_args.since)) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + monkeypatch.setattr(github_backup, "backup_pulls", fake_backup_pulls) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert seen_since == [ + ("issues", "2026-01-01T00:00:00Z"), + ("pulls", 
"2025-01-01T00:00:00Z"), + ] + assert (issues_dir / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert (pulls_dir / "last_update").read_text() == "2026-02-01T00:00:00Z" + + +def test_incremental_uses_legacy_global_last_update_for_existing_resource_backup( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z") + issues_dir = tmp_path / "repositories" / "repo-one" / "issues" + issues_dir.mkdir(parents=True) + with open(issues_dir / "1.json", "w", encoding="utf-8") as f: + json.dump({"number": 1}, f) + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append(passed_args.since) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert seen_since == ["2026-01-01T00:00:00Z"] + assert (issues_dir / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert not os.path.exists(tmp_path / "last_update") + + +def test_incremental_does_not_use_legacy_global_last_update_for_new_resource_backup( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2099-01-01T00:00:00Z") + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append(passed_args.since) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert seen_since == [None] + assert ( + tmp_path / "repositories" / "repo-one" / "issues" / "last_update" + ).read_text() == "2026-02-01T00:00:00Z" + assert not os.path.exists(tmp_path / "last_update") + + +def 
test_incremental_keeps_legacy_global_last_update_until_all_existing_resources_migrated( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z") + repo_one_issues = tmp_path / "repositories" / "repo-one" / "issues" + repo_two_issues = tmp_path / "repositories" / "repo-two" / "issues" + repo_one_issues.mkdir(parents=True) + repo_two_issues.mkdir(parents=True) + with open(repo_one_issues / "1.json", "w", encoding="utf-8") as f: + json.dump({"number": 1}, f) + with open(repo_two_issues / "2.json", "w", encoding="utf-8") as f: + json.dump({"number": 2}, f) + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + pass + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert (repo_one_issues / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert not os.path.exists(repo_two_issues / "last_update") + assert (tmp_path / "last_update").read_text() == "2026-01-01T00:00:00Z" + + +def test_incremental_does_not_remove_legacy_checkpoint_without_resource_work( + create_args, tmp_path +): + args = create_args(incremental=True, include_repository=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z") + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert (tmp_path / "last_update").read_text() == "2026-01-01T00:00:00Z" + assert not os.path.exists( + tmp_path / "repositories" / "repo-one" / "issues" / "last_update" + ) + + +def test_repository_checkpoint_time_uses_newest_available_repo_timestamp(): + repository = _repo( + "repo-one", + updated_at="2026-02-01T00:00:00Z", + pushed_at="2026-03-01T00:00:00Z", + ) + + assert github_backup.get_repository_checkpoint_time(repository) == ( + 
"2026-03-01T00:00:00Z" + ) From 6cd0ab3633df812ab586968b5b2e448e0e1b3efc Mon Sep 17 00:00:00 2001 From: Duncan Ogilvie Date: Sun, 26 Apr 2026 15:15:22 +0200 Subject: [PATCH 15/35] Reduce unnecessary pull requests with incremental fetching --- CHANGES.rst | 2 + github_backup/github_backup.py | 18 +++-- tests/test_pull_incremental_pagination.py | 85 +++++++++++++++++++++++ tests/test_pull_reviews.py | 10 +-- 4 files changed, 104 insertions(+), 11 deletions(-) create mode 100644 tests/test_pull_incremental_pagination.py diff --git a/CHANGES.rst b/CHANGES.rst index 6cf9f17..8b62d33 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -12,6 +12,8 @@ Unreleased backups use the legacy global checkpoint as a migration fallback, and the legacy file is removed once existing issue/pull backups have resource checkpoints (#62). +- Stop paginating pull requests during incremental backups once the sorted + results are older than the active checkpoint. - Add ``--token-from-gh`` to read authentication from ``gh auth token``. diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index e56bb28..f83bdb3 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -717,11 +717,12 @@ def calculate_retry_delay(attempt, headers): return delay + random.uniform(0, delay * 0.1) -def retrieve_data(args, template, query_args=None, paginated=True): +def retrieve_data(args, template, query_args=None, paginated=True, lazy=False): """ Fetch the data from GitHub API. - Handle both single requests and pagination with yield of individual dicts. + Handle both single requests and pagination. Returns a list by default, or + a generator when lazy=True so callers can stop before fetching every page. Handles throttling, retries, read errors, and DMCA takedowns. 
""" query_args = query_args or {} @@ -851,6 +852,9 @@ def _extract_legal_url(response_body_bytes): ): break # No more data + if lazy: + return fetch_all() + return list(fetch_all()) @@ -2656,16 +2660,18 @@ def pull_is_due_for_repository_checkpoint(pull): pull_states = ["open", "closed"] for pull_state in pull_states: query_args["state"] = pull_state - _pulls = retrieve_data(args, _pulls_template, query_args=query_args) - for pull in _pulls: + for pull in retrieve_data( + args, _pulls_template, query_args=query_args, lazy=True + ): track_newest_pull_update(pull) if pulls_since and pull["updated_at"] < pulls_since: break if not pulls_since or pull["updated_at"] >= pulls_since: pulls[pull["number"]] = pull else: - _pulls = retrieve_data(args, _pulls_template, query_args=query_args) - for pull in _pulls: + for pull in retrieve_data( + args, _pulls_template, query_args=query_args, lazy=True + ): track_newest_pull_update(pull) if pulls_since and pull["updated_at"] < pulls_since: break diff --git a/tests/test_pull_incremental_pagination.py b/tests/test_pull_incremental_pagination.py new file mode 100644 index 0000000..11230b0 --- /dev/null +++ b/tests/test_pull_incremental_pagination.py @@ -0,0 +1,85 @@ +"""Tests for incremental pull request pagination.""" + +import json +import os +from unittest.mock import patch + +from github_backup import github_backup + + +class MockHTTPResponse: + def __init__(self, data, link_header=None): + self._content = json.dumps(data).encode("utf-8") + self._link_header = link_header + self._read = False + self.reason = "OK" + + def getcode(self): + return 200 + + def read(self): + if self._read: + return b"" + self._read = True + return self._content + + @property + def headers(self): + headers = {"x-ratelimit-remaining": "5000"} + if self._link_header: + headers["Link"] = self._link_header + return headers + + +def test_backup_pulls_incremental_stops_before_fetching_old_pages( + create_args, tmp_path +): + args = 
create_args(include_pulls=True, incremental=True) + args.since = "2026-04-26T08:13:46Z" + repository = {"full_name": "owner/repo"} + + responses = [ + MockHTTPResponse([]), + MockHTTPResponse( + [ + { + "number": 2, + "title": "new pull", + "updated_at": "2026-04-26T09:00:00Z", + }, + { + "number": 1, + "title": "old pull", + "updated_at": "2026-04-26T07:00:00Z", + }, + ], + link_header='; rel="next"', + ), + MockHTTPResponse( + [ + { + "number": 0, + "title": "older pull on page 2", + "updated_at": "2026-04-25T07:00:00Z", + } + ] + ), + ] + requests_made = [] + + def mock_urlopen(request, *args, **kwargs): + requests_made.append(request.get_full_url()) + return responses[len(requests_made) - 1] + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + assert len(requests_made) == 2 + assert "state=open" in requests_made[0] + assert "state=closed" in requests_made[1] + assert all("page=2" not in url for url in requests_made) + assert os.path.exists(tmp_path / "pulls" / "2.json") + assert not os.path.exists(tmp_path / "pulls" / "1.json") + assert not os.path.exists(tmp_path / "pulls" / "0.json") diff --git a/tests/test_pull_reviews.py b/tests/test_pull_reviews.py index 6130269..2ce9ad1 100644 --- a/tests/test_pull_reviews.py +++ b/tests/test_pull_reviews.py @@ -16,7 +16,7 @@ def test_backup_pulls_includes_review_data(create_args, tmp_path, monkeypatch): repository = {"full_name": "owner/repo"} calls = [] - def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): calls.append((template, query_args)) if template == "https://api.github.com/repos/owner/repo/pulls": if query_args["state"] == "open": @@ -73,7 +73,7 @@ def test_pull_reviews_backfill_ignores_repository_checkpoint( args.since = "2026-01-01T00:00:00Z" repository = 
{"full_name": "owner/repo"} - def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): if template == "https://api.github.com/repos/owner/repo/pulls": if query_args["state"] == "open": return [ @@ -117,7 +117,7 @@ def test_pull_reviews_uses_review_checkpoint_when_older_than_repository_checkpoi pulls_dir.mkdir() (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z") - def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): if template == "https://api.github.com/repos/owner/repo/pulls": if query_args["state"] == "open": return [ @@ -169,7 +169,7 @@ def test_pull_reviews_preserves_existing_optional_pull_data( f, ) - def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): if template == "https://api.github.com/repos/owner/repo/pulls": if query_args["state"] == "open": return [ @@ -213,7 +213,7 @@ def test_pull_reviews_does_not_advance_checkpoint_on_review_error( pulls_dir.mkdir() (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z") - def fake_retrieve_data(passed_args, template, query_args=None, paginated=True): + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): if template == "https://api.github.com/repos/owner/repo/pulls": if query_args["state"] == "open": return [ From 9d0cfdb61da1cea97b381c2177ccc4e52e9a6352 Mon Sep 17 00:00:00 2001 From: Duncan Ogilvie Date: Sun, 26 Apr 2026 16:05:20 +0200 Subject: [PATCH 16/35] Avoid redundant release asset list requests --- CHANGES.rst | 2 + github_backup/github_backup.py | 7 ++- tests/test_releases.py | 95 ++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 
tests/test_releases.py diff --git a/CHANGES.rst b/CHANGES.rst index 8b62d33..3d2ceb0 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -14,6 +14,8 @@ Unreleased checkpoints (#62). - Stop paginating pull requests during incremental backups once the sorted results are older than the active checkpoint. +- Avoid extra release asset list requests by using asset metadata already + included in GitHub's releases response. - Add ``--token-from-gh`` to read authentication from ``gh auth token``. diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index f83bdb3..6edfb05 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -2919,7 +2919,12 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F written_count += 1 if include_assets and not skip_assets: - assets = retrieve_data(args, release["assets_url"]) + # The releases list API already includes release asset metadata. Use + # it to avoid an extra /releases/{id}/assets request per release. + # Keep a fallback for older/enterprise responses that might omit it. 
+ assets = release.get("assets") + if assets is None: + assets = retrieve_data(args, release["assets_url"]) if len(assets) > 0: # give release asset files somewhere to live & download them (not including source archives) release_assets_cwd = os.path.join(release_cwd, release_name_safe) diff --git a/tests/test_releases.py b/tests/test_releases.py new file mode 100644 index 0000000..b8584f4 --- /dev/null +++ b/tests/test_releases.py @@ -0,0 +1,95 @@ +"""Tests for release backup behavior.""" + +from github_backup import github_backup + + +def test_backup_releases_uses_embedded_assets_without_extra_asset_list_request( + create_args, tmp_path, monkeypatch +): + args = create_args(include_releases=True, include_assets=True) + repository = {"full_name": "owner/repo", "name": "repo"} + calls = [] + downloads = [] + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): + calls.append(template) + if template == "https://api.github.com/repos/owner/repo/releases": + return [ + { + "tag_name": "v1.0.0", + "created_at": "2026-01-01T00:00:00Z", + "updated_at": "2026-01-01T00:00:00Z", + "prerelease": False, + "draft": False, + "assets_url": "https://api.github.com/repos/owner/repo/releases/1/assets", + "assets": [ + { + "name": "artifact.zip", + "url": "https://api.github.com/repos/owner/repo/releases/assets/1", + } + ], + } + ] + raise AssertionError("Unexpected API request: {0}".format(template)) + + def fake_download_file(url, path, auth, as_app=False, fine=False): + downloads.append((url, path)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + monkeypatch.setattr(github_backup, "download_file", fake_download_file) + + github_backup.backup_releases( + args, + tmp_path, + repository, + "https://api.github.com/repos", + include_assets=True, + ) + + assert calls == ["https://api.github.com/repos/owner/repo/releases"] + assert downloads == [ + ( + "https://api.github.com/repos/owner/repo/releases/assets/1", + 
str(tmp_path / "releases" / "v1.0.0" / "artifact.zip"), + ) + ] + + +def test_backup_releases_falls_back_to_assets_url_when_assets_missing( + create_args, tmp_path, monkeypatch +): + args = create_args(include_releases=True, include_assets=True) + repository = {"full_name": "owner/repo", "name": "repo"} + calls = [] + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): + calls.append(template) + if template == "https://api.github.com/repos/owner/repo/releases": + return [ + { + "tag_name": "v1.0.0", + "created_at": "2026-01-01T00:00:00Z", + "updated_at": "2026-01-01T00:00:00Z", + "prerelease": False, + "draft": False, + "assets_url": "https://api.github.com/repos/owner/repo/releases/1/assets", + } + ] + if template == "https://api.github.com/repos/owner/repo/releases/1/assets": + return [] + raise AssertionError("Unexpected API request: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_releases( + args, + tmp_path, + repository, + "https://api.github.com/repos", + include_assets=True, + ) + + assert calls == [ + "https://api.github.com/repos/owner/repo/releases", + "https://api.github.com/repos/owner/repo/releases/1/assets", + ] From 014eff395a999f82674547efd77a6470b038ce91 Mon Sep 17 00:00:00 2001 From: Duncan Ogilvie Date: Sun, 26 Apr 2026 16:09:42 +0200 Subject: [PATCH 17/35] Skip checkpoint-equal incremental items --- CHANGES.rst | 4 +- github_backup/github_backup.py | 12 +++--- tests/test_discussions.py | 35 +++++++++++++++++ tests/test_pull_incremental_pagination.py | 46 +++++++++++++++++++++++ 4 files changed, 90 insertions(+), 7 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 3d2ceb0..3d4cdce 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -13,7 +13,9 @@ Unreleased legacy file is removed once existing issue/pull backups have resource checkpoints (#62). 
- Stop paginating pull requests during incremental backups once the sorted - results are older than the active checkpoint. + results are at or older than the active checkpoint. +- Avoid re-fetching discussions and pull requests whose ``updated_at`` exactly + matches the active incremental checkpoint. - Avoid extra release asset list requests by using asset metadata already included in GitHub's releases response. - Add ``--token-from-gh`` to read authentication from ``gh auth token``. diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 6edfb05..ae4ef2e 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -2233,7 +2233,7 @@ def retrieve_discussion_summaries(args, repository, since=None): if updated_at and (newest_seen is None or updated_at > newest_seen): newest_seen = updated_at - if since and updated_at and updated_at < since: + if since and updated_at and updated_at <= since: stop = True break @@ -2654,7 +2654,7 @@ def track_newest_pull_update(pull): newest_pull_update = updated_at def pull_is_due_for_repository_checkpoint(pull): - return not repository_since or pull["updated_at"] >= repository_since + return not repository_since or pull["updated_at"] > repository_since if not args.include_pull_details: pull_states = ["open", "closed"] @@ -2664,18 +2664,18 @@ def pull_is_due_for_repository_checkpoint(pull): args, _pulls_template, query_args=query_args, lazy=True ): track_newest_pull_update(pull) - if pulls_since and pull["updated_at"] < pulls_since: + if pulls_since and pull["updated_at"] <= pulls_since: break - if not pulls_since or pull["updated_at"] >= pulls_since: + if not pulls_since or pull["updated_at"] > pulls_since: pulls[pull["number"]] = pull else: for pull in retrieve_data( args, _pulls_template, query_args=query_args, lazy=True ): track_newest_pull_update(pull) - if pulls_since and pull["updated_at"] < pulls_since: + if pulls_since and pull["updated_at"] <= pulls_since: break - if not 
pulls_since or pull["updated_at"] >= pulls_since: + if not pulls_since or pull["updated_at"] > pulls_since: if pull_is_due_for_repository_checkpoint(pull): pulls[pull["number"]] = retrieve_data( args, diff --git a/tests/test_discussions.py b/tests/test_discussions.py index 89fd8dd..2b5e3fb 100644 --- a/tests/test_discussions.py +++ b/tests/test_discussions.py @@ -50,6 +50,41 @@ def test_retrieve_discussion_summaries_stops_at_incremental_since(create_args): ) +def test_retrieve_discussion_summaries_excludes_checkpoint_timestamp(create_args): + args = create_args() + repository = {"full_name": "owner/repo"} + + page = { + "repository": { + "hasDiscussionsEnabled": True, + "discussions": { + "totalCount": 1, + "nodes": [ + { + "number": 1, + "title": "already backed up", + "updatedAt": "2026-01-01T00:00:00Z", + }, + ], + "pageInfo": {"hasNextPage": True, "endCursor": "NEXT"}, + }, + } + } + + with patch( + "github_backup.github_backup.retrieve_graphql_data", return_value=page + ) as mock_retrieve: + summaries, newest, enabled, total = github_backup.retrieve_discussion_summaries( + args, repository, since="2026-01-01T00:00:00Z" + ) + + assert enabled is True + assert total == 1 + assert newest == "2026-01-01T00:00:00Z" + assert summaries == [] + assert mock_retrieve.call_count == 1 + + def test_retrieve_discussion_summaries_disabled_discussions(create_args): args = create_args() repository = {"full_name": "owner/repo"} diff --git a/tests/test_pull_incremental_pagination.py b/tests/test_pull_incremental_pagination.py index 11230b0..ac0f83f 100644 --- a/tests/test_pull_incremental_pagination.py +++ b/tests/test_pull_incremental_pagination.py @@ -31,6 +31,52 @@ def headers(self): return headers +def test_backup_pulls_incremental_excludes_checkpoint_timestamp(create_args, tmp_path): + args = create_args(include_pulls=True, incremental=True) + args.since = "2026-04-26T08:13:46Z" + repository = {"full_name": "owner/repo"} + + responses = [ + MockHTTPResponse([]), + 
MockHTTPResponse( + [ + { + "number": 1, + "title": "already backed up", + "updated_at": "2026-04-26T08:13:46Z", + }, + ], + link_header='; rel="next"', + ), + MockHTTPResponse( + [ + { + "number": 0, + "title": "older pull on page 2", + "updated_at": "2026-04-25T07:00:00Z", + } + ] + ), + ] + requests_made = [] + + def mock_urlopen(request, *args, **kwargs): + requests_made.append(request.get_full_url()) + return responses[len(requests_made) - 1] + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + assert len(requests_made) == 2 + assert "state=open" in requests_made[0] + assert "state=closed" in requests_made[1] + assert all("page=2" not in url for url in requests_made) + assert not os.path.exists(tmp_path / "pulls" / "1.json") + assert not os.path.exists(tmp_path / "pulls" / "0.json") + + def test_backup_pulls_incremental_stops_before_fetching_old_pages( create_args, tmp_path ): From f8cdf55050770bbcb1b5ba178d73b346988f0f89 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Wed, 29 Apr 2026 12:10:11 +0000 Subject: [PATCH 18/35] Release version 0.62.0 --- CHANGES.rst | 172 +++++++++++++++++++++++++++++++++----- github_backup/__init__.py | 2 +- 2 files changed, 154 insertions(+), 20 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 3d4cdce..86bcb32 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,29 +1,163 @@ Changelog ========= -Unreleased ----------- -- Add GitHub Discussions backups via GraphQL, including comments, replies, - optional attachment downloads, and per-repository incremental checkpoints. -- Add pull request review backups with ``--pull-reviews`` and one-time - incremental backfill for existing backups. -- Store incremental ``last_update`` checkpoints per repository resource instead - of using one global checkpoint for the whole output directory. 
Existing - backups use the legacy global checkpoint as a migration fallback, and the - legacy file is removed once existing issue/pull backups have resource - checkpoints (#62). -- Stop paginating pull requests during incremental backups once the sorted - results are at or older than the active checkpoint. -- Avoid re-fetching discussions and pull requests whose ``updated_at`` exactly - matches the active incremental checkpoint. -- Avoid extra release asset list requests by using asset metadata already - included in GitHub's releases response. -- Add ``--token-from-gh`` to read authentication from ``gh auth token``. +0.62.0 (2026-04-29) +------------------- +------------------------ +- Skip checkpoint-equal incremental items. [Duncan Ogilvie] +- Avoid redundant release asset list requests. [Duncan Ogilvie] +- Reduce unnecessary pull requests with incremental fetching. [Duncan + Ogilvie] +- Implement per-resource last_update timestamps. [Duncan Ogilvie] + + Closes #62 +- Add support for pull request reviews. [Duncan Ogilvie] + + Closes #124 +- Add support for discussions. [Duncan Ogilvie] + + Closes #290 +- Add --token-from-gh authentication option. [Duncan Ogilvie] +- Chore(deps): bump pytest in the python-packages group. + [dependabot[bot]] + + Bumps the python-packages group with 1 update: [pytest](https://github.com/pytest-dev/pytest). + + + Updates `pytest` from 9.0.2 to 9.0.3 + - [Release notes](https://github.com/pytest-dev/pytest/releases) + - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) + - [Commits](https://github.com/pytest-dev/pytest/compare/9.0.2...9.0.3) + + --- + updated-dependencies: + - dependency-name: pytest + dependency-version: 9.0.3 + dependency-type: direct:production + update-type: version-update:semver-patch + dependency-group: python-packages + ... +- Chore(deps): bump black in the python-packages group. + [dependabot[bot]] + + Bumps the python-packages group with 1 update: [black](https://github.com/psf/black). 
+ + + Updates `black` from 26.3.0 to 26.3.1 + - [Release notes](https://github.com/psf/black/releases) + - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) + - [Commits](https://github.com/psf/black/compare/26.3.0...26.3.1) + + --- + updated-dependencies: + - dependency-name: black + dependency-version: 26.3.1 + dependency-type: direct:production + update-type: version-update:semver-patch + dependency-group: python-packages + ... +- Chore(deps): bump docker/login-action from 3 to 4. [dependabot[bot]] + + Bumps [docker/login-action](https://github.com/docker/login-action) from 3 to 4. + - [Release notes](https://github.com/docker/login-action/releases) + - [Commits](https://github.com/docker/login-action/compare/v3...v4) + + --- + updated-dependencies: + - dependency-name: docker/login-action + dependency-version: '4' + dependency-type: direct:production + update-type: version-update:semver-major + ... +- Chore(deps): bump docker/setup-qemu-action from 3 to 4. + [dependabot[bot]] + + Bumps [docker/setup-qemu-action](https://github.com/docker/setup-qemu-action) from 3 to 4. + - [Release notes](https://github.com/docker/setup-qemu-action/releases) + - [Commits](https://github.com/docker/setup-qemu-action/compare/v3...v4) + + --- + updated-dependencies: + - dependency-name: docker/setup-qemu-action + dependency-version: '4' + dependency-type: direct:production + update-type: version-update:semver-major + ... +- Chore(deps): bump docker/build-push-action from 6 to 7. + [dependabot[bot]] + + Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 6 to 7. + - [Release notes](https://github.com/docker/build-push-action/releases) + - [Commits](https://github.com/docker/build-push-action/compare/v6...v7) + + --- + updated-dependencies: + - dependency-name: docker/build-push-action + dependency-version: '7' + dependency-type: direct:production + update-type: version-update:semver-major + ... 
+- Chore(deps): bump docker/setup-buildx-action from 3 to 4. + [dependabot[bot]] + + Bumps [docker/setup-buildx-action](https://github.com/docker/setup-buildx-action) from 3 to 4. + - [Release notes](https://github.com/docker/setup-buildx-action/releases) + - [Commits](https://github.com/docker/setup-buildx-action/compare/v3...v4) + + --- + updated-dependencies: + - dependency-name: docker/setup-buildx-action + dependency-version: '4' + dependency-type: direct:production + update-type: version-update:semver-major + ... +- Chore(deps): bump docker/metadata-action from 5 to 6. + [dependabot[bot]] + + Bumps [docker/metadata-action](https://github.com/docker/metadata-action) from 5 to 6. + - [Release notes](https://github.com/docker/metadata-action/releases) + - [Commits](https://github.com/docker/metadata-action/compare/v5...v6) + + --- + updated-dependencies: + - dependency-name: docker/metadata-action + dependency-version: '6' + dependency-type: direct:production + update-type: version-update:semver-major + ... +- Chore(deps): bump the python-packages group with 2 updates. + [dependabot[bot]] + + Bumps the python-packages group with 2 updates: [black](https://github.com/psf/black) and [setuptools](https://github.com/pypa/setuptools). 
+ + + Updates `black` from 26.1.0 to 26.3.0 + - [Release notes](https://github.com/psf/black/releases) + - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) + - [Commits](https://github.com/psf/black/compare/26.1.0...26.3.0) + + Updates `setuptools` from 82.0.0 to 82.0.1 + - [Release notes](https://github.com/pypa/setuptools/releases) + - [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst) + - [Commits](https://github.com/pypa/setuptools/compare/v82.0.0...v82.0.1) + + --- + updated-dependencies: + - dependency-name: black + dependency-version: 26.3.0 + dependency-type: direct:production + update-type: version-update:semver-minor + dependency-group: python-packages + - dependency-name: setuptools + dependency-version: 82.0.1 + dependency-type: direct:production + update-type: version-update:semver-patch + dependency-group: python-packages + ... 0.61.5 (2026-02-18) ------------------- ------------------------- - Fix empty repository crash due to None timestamp comparison (#489) [Rodos] diff --git a/github_backup/__init__.py b/github_backup/__init__.py index 294be4d..647040d 100644 --- a/github_backup/__init__.py +++ b/github_backup/__init__.py @@ -1 +1 @@ -__version__ = "0.61.5" +__version__ = "0.62.0" From 0638666bc7ebc9c55134648d0c4f3cb21932a680 Mon Sep 17 00:00:00 2001 From: Changaco Date: Fri, 10 Apr 2026 13:38:23 +0000 Subject: [PATCH 19/35] handle more network errors ```python-traceback Traceback (most recent call last): File ".local/bin/github-backup", line 6, in sys.exit(main()) ~~~~^^ File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/cli.py", line 83, in main backup_repositories(args, output_directory, repositories) ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/github_backup.py", line 1845, in backup_repositories backup_pulls(args, repo_cwd, repository, repos_template) 
~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/github_backup.py", line 2019, in backup_pulls pulls[number]["commit_data"] = retrieve_data(args, template) ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^ File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/github_backup.py", line 766, in retrieve_data return list(fetch_all()) File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/github_backup.py", line 717, in fetch_all response = json.loads(http_response.read().decode("utf-8")) ~~~~~~~~~~~~~~~~~~^^ File "/usr/lib/python3.14/http/client.py", line 500, in read s = self._safe_read(self.length) File "/usr/lib/python3.14/http/client.py", line 648, in _safe_read data = self.fp.read(cursize) File "/usr/lib/python3.14/socket.py", line 725, in readinto return self._sock.recv_into(b) ~~~~~~~~~~~~~~~~~~~~^^^ File "/usr/lib/python3.14/ssl.py", line 1304, in recv_into return self.read(nbytes, buffer) ~~~~~~~~~^^^^^^^^^^^^^^^^ File "/usr/lib/python3.14/ssl.py", line 1138, in read return self._sslobj.read(len, buffer) ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^ ConnectionResetError: [Errno 104] Connection reset by peer ``` --- github_backup/github_backup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index ae4ef2e..73a8a75 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -806,6 +806,7 @@ def _extract_legal_url(response_body_bytes): response = json.loads(http_response.read().decode("utf-8")) break # Exit retry loop and handle the data returned except ( + ConnectionError, IncompleteRead, json.decoder.JSONDecodeError, TimeoutError, From ddf82f1115f7d635993aa44454fb58c034624272 Mon Sep 17 00:00:00 2001 From: Changaco Date: Fri, 10 Apr 2026 15:25:05 +0000 Subject: [PATCH 20/35] suppress output of call to `git lfs version` --- 
github_backup/github_backup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index ae4ef2e..317a803 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -1781,7 +1781,10 @@ def get_authenticated_user(args): def check_git_lfs_install(): - exit_code = subprocess.call(["git", "lfs", "version"]) + exit_code = subprocess.call( + ["git", "lfs", "version"], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) if exit_code != 0: raise Exception( "The argument --lfs requires you to have Git LFS installed.\nYou can get it from https://git-lfs.github.com." From ddf7f82e65e5e57f0d5c499ed6f56234cb686eb3 Mon Sep 17 00:00:00 2001 From: Changaco Date: Fri, 10 Apr 2026 13:46:44 +0000 Subject: [PATCH 21/35] add missing `context` argument to `urlopen` call --- github_backup/github_backup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index ae4ef2e..6670d2d 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -1297,7 +1297,7 @@ def get_jwt_signed_url_via_markdown_api(url, token, repo_context): request.add_header("Content-Type", "application/json") request.add_header("Accept", "application/vnd.github+json") - html = urlopen(request, timeout=30).read().decode("utf-8") + html = urlopen(request, context=https_ctx, timeout=30).read().decode("utf-8") # Parse JWT-signed URL from HTML response # Format: From 2f130ecd6692bf8bc6e51bade07b5f36e56181ff Mon Sep 17 00:00:00 2001 From: Changaco Date: Fri, 10 Apr 2026 13:54:13 +0000 Subject: [PATCH 22/35] remove bad invocation of the system shell --- github_backup/github_backup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 6670d2d..80689b8 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py 
@@ -2980,7 +2980,7 @@ def fetch_repository( masked_remote_url = mask_password(remote_url) initialized = subprocess.call( - "git ls-remote " + remote_url, stdout=FNULL, stderr=FNULL, shell=True + ["git", "ls-remote", remote_url], stdout=FNULL, stderr=FNULL ) if initialized == 128: if ".wiki.git" in remote_url: From b92aee6f114f98502fea616abeefbbe924229ff0 Mon Sep 17 00:00:00 2001 From: Changaco Date: Fri, 10 Apr 2026 15:12:13 +0000 Subject: [PATCH 23/35] use `subprocess.DEVNULL` instead of emulating it --- github_backup/github_backup.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 8b96622..990993b 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -40,7 +40,6 @@ DISCUSSION_REPLIES_QUERY, ) -FNULL = open(os.devnull, "w") FILE_URI_PREFIX = "file://" logger = logging.getLogger(__name__) @@ -529,19 +528,18 @@ def get_auth(args, encode=True, for_git_cli=False): if platform.system() != "Darwin": raise Exception("Keychain arguments are only supported on Mac OSX") try: - with open(os.devnull, "w") as devnull: - token = subprocess.check_output( - [ - "security", - "find-generic-password", - "-s", - args.osx_keychain_item_name, - "-a", - args.osx_keychain_item_account, - "-w", - ], - stderr=devnull, - ).strip() + token = subprocess.check_output( + [ + "security", + "find-generic-password", + "-s", + args.osx_keychain_item_name, + "-a", + args.osx_keychain_item_account, + "-w", + ], + stderr=subprocess.DEVNULL, + ).strip() token = token.decode("utf-8") auth = token + ":" + "x-oauth-basic" except subprocess.SubprocessError: @@ -2984,7 +2982,8 @@ def fetch_repository( masked_remote_url = mask_password(remote_url) initialized = subprocess.call( - ["git", "ls-remote", remote_url], stdout=FNULL, stderr=FNULL + ["git", "ls-remote", remote_url], + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) if initialized == 128: if 
".wiki.git" in remote_url: From f3eabf0bfe522b7749d693ceaa65c5de4f13d8bc Mon Sep 17 00:00:00 2001 From: Changaco Date: Fri, 10 Apr 2026 16:23:03 +0000 Subject: [PATCH 24/35] don't pass stdin when doing so can't do any good When the child process doesn't inherit stderr, it can't ask the user for input, so it shouldn't inherit stdin either. --- github_backup/github_backup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 990993b..b76322a 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -1781,7 +1781,7 @@ def get_authenticated_user(args): def check_git_lfs_install(): exit_code = subprocess.call( - ["git", "lfs", "version"], + ["git", "lfs", "version"], stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) if exit_code != 0: @@ -2982,7 +2982,7 @@ def fetch_repository( masked_remote_url = mask_password(remote_url) initialized = subprocess.call( - ["git", "ls-remote", remote_url], + ["git", "ls-remote", remote_url], stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) if initialized == 128: From ccc27b95f7203ec42bf695cc270317fdd73f4489 Mon Sep 17 00:00:00 2001 From: Changaco Date: Thu, 30 Apr 2026 10:46:46 +0000 Subject: [PATCH 25/35] remove legacy code in `mkdir_p` function --- github_backup/github_backup.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index b76322a..4c07808 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -6,7 +6,6 @@ import base64 import calendar import codecs -import errno import json import logging import os @@ -127,13 +126,7 @@ def check_io(): def mkdir_p(*args): for path in args: - try: - os.makedirs(path) - except OSError as exc: # Python >2.5 - if exc.errno == errno.EEXIST and os.path.isdir(path): - pass - else: - raise + 
os.makedirs(path, exist_ok=True) def mask_password(url, secret="*****"): From f1fca0f9b7379e02c3d0903daee9d1954d7009eb Mon Sep 17 00:00:00 2001 From: Changaco Date: Thu, 30 Apr 2026 10:53:40 +0000 Subject: [PATCH 26/35] don't leave files open --- github_backup/github_backup.py | 41 ++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 4c07808..e567d3e 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -624,7 +624,8 @@ def get_github_host(args): def read_file_contents(file_uri): - return open(file_uri[len(FILE_URI_PREFIX) :], "rt").readline().strip() + with open(file_uri[len(FILE_URI_PREFIX) :], "rt") as f: + return f.readline().strip() def read_token_from_gh_cli(args): @@ -1964,10 +1965,11 @@ def read_legacy_last_update(args, output_directory): return None, None last_update_path = os.path.join(output_directory, INCREMENTAL_LAST_UPDATE_FILENAME) - if os.path.exists(last_update_path): - return last_update_path, open(last_update_path).read().strip() - - return last_update_path, None + try: + with open(last_update_path) as f: + return last_update_path, f.read().strip() + except FileNotFoundError: + return last_update_path, None def read_resource_last_update(args, resource_cwd, legacy_last_update=None): @@ -1975,13 +1977,13 @@ def read_resource_last_update(args, resource_cwd, legacy_last_update=None): return None last_update_path = os.path.join(resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME) - if os.path.exists(last_update_path): - return open(last_update_path).read().strip() - - if legacy_last_update and resource_backup_exists(resource_cwd): - return legacy_last_update - - return None + try: + with open(last_update_path) as f: + return f.read().strip() + except FileNotFoundError: + if legacy_last_update and resource_backup_exists(resource_cwd): + return legacy_last_update + return None def write_resource_last_update(args, 
resource_cwd, repository): @@ -1990,7 +1992,8 @@ def write_resource_last_update(args, resource_cwd, repository): mkdir_p(resource_cwd) last_update_path = os.path.join(resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME) - open(last_update_path, "w").write(get_repository_checkpoint_time(repository)) + with open(last_update_path, "w") as f: + f.write(get_repository_checkpoint_time(repository)) def iter_incremental_resource_dirs(output_directory): @@ -2378,7 +2381,8 @@ def backup_discussions(args, repo_cwd, repository): discussions_since = None discussion_last_update_path = os.path.join(discussion_cwd, "last_update") if args.incremental and os.path.exists(discussion_last_update_path): - discussions_since = open(discussion_last_update_path).read().strip() + with open(discussion_last_update_path) as f: + discussions_since = f.read().strip() logger.info("Retrieving {0} discussions".format(repository["full_name"])) try: @@ -2464,7 +2468,8 @@ def backup_discussions(args, repo_cwd, repository): and newest_seen and (not discussions_since or newest_seen > discussions_since) ): - open(discussion_last_update_path, "w").write(newest_seen) + with open(discussion_last_update_path, "w") as f: + f.write(newest_seen) attempted_count = len(summaries) - skipped_count if not summaries: @@ -2601,7 +2606,8 @@ def get_pull_reviews_since(args, pulls_cwd): # repository-level checkpoint would otherwise skip old PRs forever. 
return None, None, reviews_last_update_path - reviews_since = open(reviews_last_update_path).read().strip() + with open(reviews_last_update_path) as f: + reviews_since = f.read().strip() if args_since and reviews_since: return min(args_since, reviews_since), reviews_since, reviews_last_update_path @@ -2753,7 +2759,8 @@ def pull_is_due_for_repository_checkpoint(pull): and not pull_review_errors and (not pull_reviews_since or newest_pull_update > pull_reviews_since) ): - open(pull_reviews_last_update_path, "w").write(newest_pull_update) + with open(pull_reviews_last_update_path, "w") as f: + f.write(newest_pull_update) def backup_milestones(args, repo_cwd, repository, repos_template): From 17b79fcbef880e529ab376090fbd193f102300ac Mon Sep 17 00:00:00 2001 From: Changaco Date: Thu, 30 Apr 2026 10:58:08 +0000 Subject: [PATCH 27/35] rename a function to match what it actually does --- github_backup/github_backup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index e567d3e..f4a94b9 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -545,7 +545,7 @@ def get_auth(args, encode=True, for_git_cli=False): ) elif args.token_fine: if args.token_fine.startswith(FILE_URI_PREFIX): - args.token_fine = read_file_contents(args.token_fine) + args.token_fine = read_first_line(args.token_fine) if args.token_fine.startswith("github_pat_"): auth = args.token_fine @@ -561,7 +561,7 @@ def get_auth(args, encode=True, for_git_cli=False): ) args.token_classic = read_token_from_gh_cli(args) elif args.token_classic.startswith(FILE_URI_PREFIX): - args.token_classic = read_file_contents(args.token_classic) + args.token_classic = read_first_line(args.token_classic) if not args.as_app: auth = args.token_classic + ":" + "x-oauth-basic" @@ -623,7 +623,7 @@ def get_github_host(args): return host -def read_file_contents(file_uri): +def read_first_line(file_uri): with 
open(file_uri[len(FILE_URI_PREFIX) :], "rt") as f: return f.readline().strip() From 3cda5a01fdf094ea33de7d3c02aa7cc60d553e9b Mon Sep 17 00:00:00 2001 From: Changaco Date: Fri, 10 Apr 2026 20:32:16 +0000 Subject: [PATCH 28/35] document that `--all` doesn't imply `--attachments` --- README.rst | 2 +- github_backup/github_backup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 3a4be3b..ed037fd 100644 --- a/README.rst +++ b/README.rst @@ -325,7 +325,7 @@ Gotchas / Known-issues All is not everything --------------------- -The ``--all`` argument does not include: cloning private repos (``-P, --private``), cloning forks (``-F, --fork``), cloning starred repositories (``--all-starred``), ``--pull-details``, cloning LFS repositories (``--lfs``), cloning gists (``--gists``) or cloning starred gist repos (``--starred-gists``). See examples for more. +The ``--all`` argument does not include: downloading attachments from issue and pull request comments (``--attachments``), cloning private repos (``-P, --private``), cloning forks (``-F, --fork``), cloning starred repositories (``--all-starred``), ``--pull-details``, cloning LFS repositories (``--lfs``), cloning gists (``--gists``) or cloning starred gist repos (``--starred-gists``). See examples for more. 
Starred repository size ----------------------- diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 8b96622..dc872c7 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -488,7 +488,7 @@ def parse_args(args=None): "--attachments", action="store_true", dest="include_attachments", - help="download user-attachments from issues, pull requests, and discussions", + help="download user-attachments from issues, pull requests, and discussions [*]", ) parser.add_argument( "--throttle-limit", From 543d76f24bc4eb808618e7a8b5ccbabea80fa700 Mon Sep 17 00:00:00 2001 From: Changaco Date: Fri, 10 Apr 2026 20:35:06 +0000 Subject: [PATCH 29/35] fix a typo in the README --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index ed037fd..e5f0f14 100644 --- a/README.rst +++ b/README.rst @@ -363,7 +363,7 @@ This means any blocking errors on previous runs can cause missing data in backup Using (``--incremental-by-files``) will request new data from the API **based on when the file was modified on filesystem**. e.g. if you modify the file yourself you may miss something. -Still saver than the previous version. +Still safer than the previous version. Specifically, issues and pull requests are handled like this. From 9340aa3aaada4c2d41aa8f9c1b6164f9ee9ed082 Mon Sep 17 00:00:00 2001 From: Changaco Date: Fri, 10 Apr 2026 20:35:47 +0000 Subject: [PATCH 30/35] try to clarify what `--incremental` actually does --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index e5f0f14..1bd3ff6 100644 --- a/README.rst +++ b/README.rst @@ -365,7 +365,7 @@ Using (``--incremental-by-files``) will request new data from the API **based on Still safer than the previous version. -Specifically, issues and pull requests are handled like this. +Incremental backup only changes how issue and pull request data is fetched. 
Known blocking errors --------------------- From a2391a550e45ff4882f006696599fcd408317781 Mon Sep 17 00:00:00 2001 From: Changaco Date: Fri, 10 Apr 2026 20:37:05 +0000 Subject: [PATCH 31/35] remove pointless and unsafe `export`s in examples --- README.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 1bd3ff6..33a89fb 100644 --- a/README.rst +++ b/README.rst @@ -429,12 +429,12 @@ Github Backup Examples Backup all repositories, including private ones using a classic token:: - export ACCESS_TOKEN=SOME-GITHUB-TOKEN + ACCESS_TOKEN=SOME-GITHUB-TOKEN github-backup WhiteHouse --token $ACCESS_TOKEN --organization --output-directory /tmp/white-house --repositories --private Use a fine-grained access token to backup a single organization repository with everything else (wiki, pull requests, comments, issues etc):: - export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN + FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN ORGANIZATION=docker REPO=cli # e.g. git@github.com:docker/cli.git @@ -442,14 +442,14 @@ Use a fine-grained access token to backup a single organization repository with Quietly and incrementally backup useful Github user data (public and private repos with SSH) including; all issues, pulls, all public starred repos and gists (omitting "hooks", "releases" and therefore "assets" to prevent blocking). *Great for a cron job.* :: - export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN + FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN GH_USER=YOUR-GITHUB-USER github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-reviews --pull-commits --labels --milestones --security-advisories --discussions --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER Debug an error/block or incomplete backup into a temporary directory. 
Omit "incremental" to fill a previous incomplete backup. :: - export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN + FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN GH_USER=YOUR-GITHUB-USER github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-reviews --pull-commits --labels --milestones --discussions --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER From d30d9bfe6034b174ae3839f7aa13f4ad2eff4dc3 Mon Sep 17 00:00:00 2001 From: Changaco Date: Fri, 10 Apr 2026 20:38:31 +0000 Subject: [PATCH 32/35] eliminate trailing spaces --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 33a89fb..c4d0fd0 100644 --- a/README.rst +++ b/README.rst @@ -22,7 +22,7 @@ Using PIP via PyPI:: Using PIP via Github (more likely the latest version):: pip install git+https://github.com/josegonzalez/python-github-backup.git#egg=github-backup - + *Install note for python newcomers:* Python scripts are unlikely to be included in your ``$PATH`` by default, this means it cannot be run directly in terminal with ``$ github-backup ...``, you can either add python's install path to your environments ``$PATH`` or call the script directly e.g. using ``$ ~/.local/bin/github-backup``.* @@ -249,7 +249,7 @@ Note: When you run github-backup, you will be asked whether you want to allow " Github Rate-limit and Throttling -------------------------------- -"github-backup" will automatically throttle itself based on feedback from the Github API. +"github-backup" will automatically throttle itself based on feedback from the Github API. Their API is usually rate-limited to 5000 calls per hour. The API will ask github-backup to pause until a specific time when the limit is reset again (at the start of the next hour). This continues until the backup is complete. 
@@ -446,7 +446,7 @@ Quietly and incrementally backup useful Github user data (public and private rep GH_USER=YOUR-GITHUB-USER github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-reviews --pull-commits --labels --milestones --security-advisories --discussions --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER - + Debug an error/block or incomplete backup into a temporary directory. Omit "incremental" to fill a previous incomplete backup. :: FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN From 8e76089565d7822bd94816433c2509daee40f26b Mon Sep 17 00:00:00 2001 From: Changaco Date: Sat, 25 Apr 2026 07:07:24 +0000 Subject: [PATCH 33/35] document that nothing is saved by default --- README.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.rst b/README.rst index c4d0fd0..c3d5d5d 100644 --- a/README.rst +++ b/README.rst @@ -327,6 +327,11 @@ All is not everything The ``--all`` argument does not include: downloading attachments from issue and pull request comments (``--attachments``), cloning private repos (``-P, --private``), cloning forks (``-F, --fork``), cloning starred repositories (``--all-starred``), ``--pull-details``, cloning LFS repositories (``--lfs``), cloning gists (``--gists``) or cloning starred gist repos (``--starred-gists``). See examples for more. +Saves nothing if no arguments are passed +---------------------------------------- + +At least one argument like ``--all`` or ``--repositories`` is needed for github-backup to actually save data. Without relevant arguments, github-backup fetches some data from GitHub but doesn't put any of it into files. 
+ Starred repository size ----------------------- From bd6eea02d5095a83d25f2d57202bb78c93be1cc2 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 30 Apr 2026 15:52:41 +0000 Subject: [PATCH 34/35] Release version 0.62.1 --- CHANGES.rst | 58 ++++++++++++++++++++++++++++++++++++++- github_backup/__init__.py | 2 +- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 86bcb32..20ac838 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,9 +1,65 @@ Changelog ========= -0.62.0 (2026-04-29) +0.62.1 (2026-04-30) ------------------- ------------------------ +- Document that nothing is saved by default. [Changaco] +- Eliminate trailing spaces. [Changaco] +- Remove pointless and unsafe `export`s in examples. [Changaco] +- Try to clarify what `--incremental` actually does. [Changaco] +- Fix a typo in the README. [Changaco] +- Document that `--all` doesn't imply `--attachments` [Changaco] +- Rename a function to match what it actually does. [Changaco] +- Don't leave files open. [Changaco] +- Remove legacy code in `mkdir_p` function. [Changaco] +- Don't pass stdin when doing so can't do any good. [Changaco] + + When the child process doesn't inherit stderr, it can't ask the user for input, so it shouldn't inherit stdin either. +- Use `subprocess.DEVNULL` instead of emulating it. [Changaco] +- Remove bad invocation of the system shell. [Changaco] +- Add missing `context` argument to `urlopen` call. [Changaco] +- Suppress output of call to `git lfs version` [Changaco] +- Handle more network errors. 
[Changaco] + + ```python-traceback + Traceback (most recent call last): + File ".local/bin/github-backup", line 6, in <module> + sys.exit(main()) + ~~~~^^ + File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/cli.py", line 83, in main + backup_repositories(args, output_directory, repositories) + ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/github_backup.py", line 1845, in backup_repositories + backup_pulls(args, repo_cwd, repository, repos_template) + ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/github_backup.py", line 2019, in backup_pulls + pulls[number]["commit_data"] = retrieve_data(args, template) + ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^ + File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/github_backup.py", line 766, in retrieve_data + return list(fetch_all()) + File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/github_backup.py", line 717, in fetch_all + response = json.loads(http_response.read().decode("utf-8")) + ~~~~~~~~~~~~~~~~~~^^ + File "/usr/lib/python3.14/http/client.py", line 500, in read + s = self._safe_read(self.length) + File "/usr/lib/python3.14/http/client.py", line 648, in _safe_read + data = self.fp.read(cursize) + File "/usr/lib/python3.14/socket.py", line 725, in readinto + return self._sock.recv_into(b) + ~~~~~~~~~~~~~~~~~~~~^^^ + File "/usr/lib/python3.14/ssl.py", line 1304, in recv_into + return self.read(nbytes, buffer) + ~~~~~~~~~^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.14/ssl.py", line 1138, in read + return self._sslobj.read(len, buffer) + ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^ + ConnectionResetError: [Errno 104] Connection reset by peer + ``` + + +0.62.0 (2026-04-29) +------------------- - Skip checkpoint-equal incremental items.
[Duncan Ogilvie] - Avoid redundant release asset list requests. [Duncan Ogilvie] - Reduce unnecessary pull requests with incremental fetching. [Duncan diff --git a/github_backup/__init__.py b/github_backup/__init__.py index 647040d..b7b61f3 100644 --- a/github_backup/__init__.py +++ b/github_backup/__init__.py @@ -1 +1 @@ -__version__ = "0.62.0" +__version__ = "0.62.1" From 2cbce1425cbb2a2f00ba7996f795415d2ede6c37 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 May 2026 22:45:36 +0000 Subject: [PATCH 35/35] chore(deps): bump black in the python-packages group Bumps the python-packages group with 1 update: [black](https://github.com/psf/black). Updates `black` from 26.3.1 to 26.5.1 - [Release notes](https://github.com/psf/black/releases) - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) - [Commits](https://github.com/psf/black/compare/26.3.1...26.5.1) --- updated-dependencies: - dependency-name: black dependency-version: 26.5.1 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: python-packages ... Signed-off-by: dependabot[bot] <support@github.com> --- release-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release-requirements.txt b/release-requirements.txt index ad8bc5c..117aeea 100644 --- a/release-requirements.txt +++ b/release-requirements.txt @@ -1,6 +1,6 @@ # Linting & Formatting autopep8==2.3.2 -black==26.3.1 +black==26.5.1 flake8==7.3.0 # Testing