diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index f367b99..4e5c89b 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -43,13 +43,13 @@ jobs: persist-credentials: false - name: Set up QEMU - uses: docker/setup-qemu-action@v3 + uses: docker/setup-qemu-action@v4 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@v4 - name: Log in to the Container registry - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} @@ -57,7 +57,7 @@ jobs: - name: Extract metadata (tags, labels) for Docker id: meta - uses: docker/metadata-action@v5 + uses: docker/metadata-action@v6 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} tags: | @@ -68,7 +68,7 @@ jobs: type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'main') }} - name: Build and push Docker image - uses: docker/build-push-action@v6 + uses: docker/build-push-action@v7 with: context: . push: true diff --git a/CHANGES.rst b/CHANGES.rst index a6a1c4d..20ac838 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,9 +1,405 @@ Changelog ========= -0.59.0 (2025-12-21) +0.62.1 (2026-04-30) ------------------- ------------------------ +- Document that nothing is saved by default. [Changaco] +- Eliminate trailing spaces. [Changaco] +- Remove pointless and unsafe `export`s in examples. [Changaco] +- Try to clarify what `--incremental` actually does. [Changaco] +- Fix a typo in the README. [Changaco] +- Document that `--all` doesn't imply `--attachments` [Changaco] +- Rename a function to match what it actually does. [Changaco] +- Don't leave files open. [Changaco] +- Remove legacy code in `mkdir_p` function. [Changaco] +- Don't pass stdin when doing so can't do any good. [Changaco] + + When the child process doesn't inherit stderr, it can't ask the user for input, so it shouldn't inherit stdin either. +- Use `subprocess.DEVNULL` instead of emulating it. [Changaco] +- Remove bad invocation of the system shell. [Changaco] +- Add missing `context` argument to `urlopen` call. [Changaco] +- Suppress output of call to `git lfs version` [Changaco] +- Handle more network errors. [Changaco] + + ```python-traceback + Traceback (most recent call last): + File ".local/bin/github-backup", line 6, in + sys.exit(main()) + ~~~~^^ + File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/cli.py", line 83, in main + backup_repositories(args, output_directory, repositories) + ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/github_backup.py", line 1845, in backup_repositories + backup_pulls(args, repo_cwd, repository, repos_template) + ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/github_backup.py", line 2019, in backup_pulls + pulls[number]["commit_data"] = retrieve_data(args, template) + ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^ + File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/github_backup.py", line 766, in retrieve_data + return list(fetch_all()) + File ".local/share/pipx/venvs/github-backup/lib/python3.14/site-packages/github_backup/github_backup.py", line 717, in fetch_all + response = json.loads(http_response.read().decode("utf-8")) + ~~~~~~~~~~~~~~~~~~^^ + File "/usr/lib/python3.14/http/client.py", line 500, in read + s = self._safe_read(self.length) + File "/usr/lib/python3.14/http/client.py", line 648, in _safe_read + data = self.fp.read(cursize) + File "/usr/lib/python3.14/socket.py", line 725, in readinto + return self._sock.recv_into(b) + ~~~~~~~~~~~~~~~~~~~~^^^ + File "/usr/lib/python3.14/ssl.py", line 1304, in recv_into + return self.read(nbytes, buffer) + ~~~~~~~~~^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.14/ssl.py", line 1138, in read + return self._sslobj.read(len, buffer) + ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^ + ConnectionResetError: [Errno 104] Connection reset by peer + ``` + + +0.62.0 (2026-04-29) +------------------- +- Skip checkpoint-equal incremental items. [Duncan Ogilvie] +- Avoid redundant release asset list requests. [Duncan Ogilvie] +- Reduce unnecessary pull requests with incremental fetching. [Duncan + Ogilvie] +- Implement per-resource last_update timestamps. [Duncan Ogilvie] + + Closes #62 +- Add support for pull request reviews. [Duncan Ogilvie] + + Closes #124 +- Add support for discussions. [Duncan Ogilvie] + + Closes #290 +- Add --token-from-gh authentication option. [Duncan Ogilvie] +- Chore(deps): bump pytest in the python-packages group. + [dependabot[bot]] + + Bumps the python-packages group with 1 update: [pytest](https://github.com/pytest-dev/pytest). + + + Updates `pytest` from 9.0.2 to 9.0.3 + - [Release notes](https://github.com/pytest-dev/pytest/releases) + - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) + - [Commits](https://github.com/pytest-dev/pytest/compare/9.0.2...9.0.3) + + --- + updated-dependencies: + - dependency-name: pytest + dependency-version: 9.0.3 + dependency-type: direct:production + update-type: version-update:semver-patch + dependency-group: python-packages + ... +- Chore(deps): bump black in the python-packages group. + [dependabot[bot]] + + Bumps the python-packages group with 1 update: [black](https://github.com/psf/black). + + + Updates `black` from 26.3.0 to 26.3.1 + - [Release notes](https://github.com/psf/black/releases) + - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) + - [Commits](https://github.com/psf/black/compare/26.3.0...26.3.1) + + --- + updated-dependencies: + - dependency-name: black + dependency-version: 26.3.1 + dependency-type: direct:production + update-type: version-update:semver-patch + dependency-group: python-packages + ... +- Chore(deps): bump docker/login-action from 3 to 4. [dependabot[bot]] + + Bumps [docker/login-action](https://github.com/docker/login-action) from 3 to 4. + - [Release notes](https://github.com/docker/login-action/releases) + - [Commits](https://github.com/docker/login-action/compare/v3...v4) + + --- + updated-dependencies: + - dependency-name: docker/login-action + dependency-version: '4' + dependency-type: direct:production + update-type: version-update:semver-major + ... +- Chore(deps): bump docker/setup-qemu-action from 3 to 4. + [dependabot[bot]] + + Bumps [docker/setup-qemu-action](https://github.com/docker/setup-qemu-action) from 3 to 4. + - [Release notes](https://github.com/docker/setup-qemu-action/releases) + - [Commits](https://github.com/docker/setup-qemu-action/compare/v3...v4) + + --- + updated-dependencies: + - dependency-name: docker/setup-qemu-action + dependency-version: '4' + dependency-type: direct:production + update-type: version-update:semver-major + ... +- Chore(deps): bump docker/build-push-action from 6 to 7. + [dependabot[bot]] + + Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 6 to 7. + - [Release notes](https://github.com/docker/build-push-action/releases) + - [Commits](https://github.com/docker/build-push-action/compare/v6...v7) + + --- + updated-dependencies: + - dependency-name: docker/build-push-action + dependency-version: '7' + dependency-type: direct:production + update-type: version-update:semver-major + ... +- Chore(deps): bump docker/setup-buildx-action from 3 to 4. + [dependabot[bot]] + + Bumps [docker/setup-buildx-action](https://github.com/docker/setup-buildx-action) from 3 to 4. + - [Release notes](https://github.com/docker/setup-buildx-action/releases) + - [Commits](https://github.com/docker/setup-buildx-action/compare/v3...v4) + + --- + updated-dependencies: + - dependency-name: docker/setup-buildx-action + dependency-version: '4' + dependency-type: direct:production + update-type: version-update:semver-major + ... +- Chore(deps): bump docker/metadata-action from 5 to 6. + [dependabot[bot]] + + Bumps [docker/metadata-action](https://github.com/docker/metadata-action) from 5 to 6. + - [Release notes](https://github.com/docker/metadata-action/releases) + - [Commits](https://github.com/docker/metadata-action/compare/v5...v6) + + --- + updated-dependencies: + - dependency-name: docker/metadata-action + dependency-version: '6' + dependency-type: direct:production + update-type: version-update:semver-major + ... +- Chore(deps): bump the python-packages group with 2 updates. + [dependabot[bot]] + + Bumps the python-packages group with 2 updates: [black](https://github.com/psf/black) and [setuptools](https://github.com/pypa/setuptools). + + + Updates `black` from 26.1.0 to 26.3.0 + - [Release notes](https://github.com/psf/black/releases) + - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) + - [Commits](https://github.com/psf/black/compare/26.1.0...26.3.0) + + Updates `setuptools` from 82.0.0 to 82.0.1 + - [Release notes](https://github.com/pypa/setuptools/releases) + - [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst) + - [Commits](https://github.com/pypa/setuptools/compare/v82.0.0...v82.0.1) + + --- + updated-dependencies: + - dependency-name: black + dependency-version: 26.3.0 + dependency-type: direct:production + update-type: version-update:semver-minor + dependency-group: python-packages + - dependency-name: setuptools + dependency-version: 82.0.1 + dependency-type: direct:production + update-type: version-update:semver-patch + dependency-group: python-packages + ... + + +0.61.5 (2026-02-18) +------------------- +- Fix empty repository crash due to None timestamp comparison (#489) + [Rodos] + + Empty repositories have None for pushed_at/updated_at, causing a + TypeError when compared to the last_update string. Use .get() with + truthiness check to skip None timestamps in incremental tracking. + + +0.61.4 (2026-02-16) +------------------- +- Fix HTTP 451 DMCA and 403 TOS handling regression (#487) [Rodos] + + The DMCA handling added in PR #454 had a bug: make_request_with_retry() + raises HTTPError before retrieve_data() could check the status code via + getcode(), making the case 451 handler dead code. This also affected + HTTP 403 TOS violations (e.g. jumoog/MagiskOnWSA). + + Fix by catching HTTPError in retrieve_data() and converting 451 and + blocked 403 responses (identified by "block" key in response body) to + RepositoryUnavailableError. Non-block 403s (permissions, scopes) still + propagate as HTTPError. Also handle RepositoryUnavailableError in + retrieve_repositories() for the --repository case. + + Rewrote tests to mock urlopen (not make_request_with_retry) to exercise + the real code path that was previously untested. + + Closes #487 +- Chore(deps): bump setuptools in the python-packages group. + [dependabot[bot]] + + Bumps the python-packages group with 1 update: [setuptools](https://github.com/pypa/setuptools). + + + Updates `setuptools` from 80.10.2 to 82.0.0 + - [Release notes](https://github.com/pypa/setuptools/releases) + - [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst) + - [Commits](https://github.com/pypa/setuptools/compare/v80.10.2...v82.0.0) + + --- + updated-dependencies: + - dependency-name: setuptools + dependency-version: 82.0.0 + dependency-type: direct:production + update-type: version-update:semver-major + dependency-group: python-packages + ... +- Chore(deps): bump setuptools in the python-packages group. + [dependabot[bot]] + + Bumps the python-packages group with 1 update: [setuptools](https://github.com/pypa/setuptools). + + + Updates `setuptools` from 80.10.1 to 80.10.2 + - [Release notes](https://github.com/pypa/setuptools/releases) + - [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst) + - [Commits](https://github.com/pypa/setuptools/compare/v80.10.1...v80.10.2) + + --- + updated-dependencies: + - dependency-name: setuptools + dependency-version: 80.10.2 + dependency-type: direct:production + update-type: version-update:semver-patch + dependency-group: python-packages + ... + + +0.61.3 (2026-01-24) +------------------- +- Fix KeyError: 'Private' when using --all flag (#481) [Rodos] + + The repository dictionary uses lowercase "private" key. Use .get() with + the correct case to match the pattern used elsewhere in the codebase. + + The bug only affects --all users since --security-advisories short-circuits + before the key access. +- Chore(deps): bump setuptools in the python-packages group. + [dependabot[bot]] + + Bumps the python-packages group with 1 update: [setuptools](https://github.com/pypa/setuptools). + + + Updates `setuptools` from 80.9.0 to 80.10.1 + - [Release notes](https://github.com/pypa/setuptools/releases) + - [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst) + - [Commits](https://github.com/pypa/setuptools/compare/v80.9.0...v80.10.1) + + --- + updated-dependencies: + - dependency-name: setuptools + dependency-version: 80.10.1 + dependency-type: direct:production + update-type: version-update:semver-minor + dependency-group: python-packages + ... + + +0.61.2 (2026-01-19) +------------------- + +Fix +~~~ +- Skip security advisories for private repos unless explicitly + requested. [Lukas Bestle] +- Handle 404 errors on security advisories. [Lukas Bestle] + +Other +~~~~~ +- Chore(deps): bump black in the python-packages group. + [dependabot[bot]] + + Bumps the python-packages group with 1 update: [black](https://github.com/psf/black). + + + Updates `black` from 25.12.0 to 26.1.0 + - [Release notes](https://github.com/psf/black/releases) + - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) + - [Commits](https://github.com/psf/black/compare/25.12.0...26.1.0) + + --- + updated-dependencies: + - dependency-name: black + dependency-version: 26.1.0 + dependency-type: direct:production + update-type: version-update:semver-major + dependency-group: python-packages + ... +- Docs: Explain security advisories in README. [Lukas Bestle] +- Feat: Only make security advisory dir if successful. [Lukas Bestle] + + Avoids empty directories for private repos + + +0.61.1 (2026-01-13) +------------------- +- Refactor test fixtures to use shared create_args helper. [Rodos] + + Uses the real parse_args() function to get CLI defaults, so when + new arguments are added they're automatically available to all tests. + + Changes: + - Add tests/conftest.py with create_args fixture + - Update 8 test files to use shared fixture + - Remove duplicate _create_mock_args methods + - Remove redundant @pytest.fixture mock_args definitions + + This eliminates the need to update multiple test files when + adding new CLI arguments. +- Fix fine-grained PAT attachment downloads for private repos (#477) + [Rodos] + + Fine-grained personal access tokens cannot download attachments from + private repositories directly due to a GitHub platform limitation. + + This adds a workaround for image attachments (/assets/ URLs) using + GitHub's Markdown API to convert URLs to JWT-signed URLs that can be + downloaded without authentication. + + Changes: + - Add get_jwt_signed_url_via_markdown_api() function + - Detect fine-grained token + private repo + /assets/ URL upfront + - Use JWT workaround for those cases, mark success with jwt_workaround flag + - Skip download with skipped_at when workaround fails + - Add startup warning when using --attachments with fine-grained tokens + - Document limitation in README (file attachments still fail) + - Add 6 unit tests for JWT workaround logic + + +0.61.0 (2026-01-12) +------------------- +- Docs: Add missing `--retries` argument to README. [Lukas Bestle] +- Test: Adapt tests to new argument. [Lukas Bestle] +- Feat: Backup of repository security advisories. [Lukas Bestle] + + +0.60.0 (2025-12-24) +------------------- +- Rm max_retries.py. [michaelmartinez] +- Readme. [michaelmartinez] +- Don't use a global variable, pass the args instead. [michaelmartinez] +- Readme, simplify the logic a bit. [michaelmartinez] +- Max_retries 5. [michaelmartinez] + + +0.59.0 (2025-12-21) +------------------- - Add --starred-skip-size-over flag to limit starred repo size (#108) [Rodos] diff --git a/README.rst b/README.rst index ffa80ac..c3d5d5d 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ github-backup |PyPI| |Python Versions| -The package can be used to backup an *entire* `Github `_ organization, repository or user account, including starred repos, issues and wikis in the most appropriate format (clones for wikis, json files for issues). +The package can be used to backup an *entire* `Github `_ organization, repository or user account, including starred repos, issues, discussions and wikis in the most appropriate format (clones for wikis, json files for issues and discussions). Requirements ============ @@ -22,7 +22,7 @@ Using PIP via PyPI:: Using PIP via Github (more likely the latest version):: pip install git+https://github.com/josegonzalez/python-github-backup.git#egg=github-backup - + *Install note for python newcomers:* Python scripts are unlikely to be included in your ``$PATH`` by default, this means it cannot be run directly in terminal with ``$ github-backup ...``, you can either add python's install path to your environments ``$PATH`` or call the script directly e.g. using ``$ ~/.local/bin/github-backup``.* @@ -36,16 +36,18 @@ Show the CLI help output:: CLI Help output:: - github-backup [-h] [-t TOKEN_CLASSIC] [-f TOKEN_FINE] [-q] [--as-app] - [-o OUTPUT_DIRECTORY] [-l LOG_LEVEL] [-i] + github-backup [-h] [-t TOKEN_CLASSIC] [-f TOKEN_FINE] [--token-from-gh] + [-q] [--as-app] [-o OUTPUT_DIRECTORY] [-l LOG_LEVEL] [-i] [--incremental-by-files] [--starred] [--all-starred] [--starred-skip-size-over MB] [--watched] [--followers] [--following] [--all] [--issues] [--issue-comments] [--issue-events] [--pulls] - [--pull-comments] [--pull-commits] [--pull-details] - [--labels] [--hooks] [--milestones] [--repositories] - [--bare] [--no-prune] [--lfs] [--wikis] [--gists] - [--starred-gists] [--skip-archived] [--skip-existing] + [--pull-comments] [--pull-reviews] [--pull-commits] + [--pull-details] + [--labels] [--hooks] [--milestones] [--security-advisories] + [--discussions] [--repositories] [--bare] [--no-prune] + [--lfs] [--wikis] [--gists] [--starred-gists] + [--skip-archived] [--skip-existing] [-L [LANGUAGES ...]] [-N NAME_REGEX] [-H GITHUB_HOST] [-O] [-R REPOSITORY] [-P] [-F] [--prefer-ssh] [-v] [--keychain-name OSX_KEYCHAIN_ITEM_NAME] @@ -55,7 +57,7 @@ CLI Help output:: [--skip-assets-on [SKIP_ASSETS_ON ...]] [--attachments] [--throttle-limit THROTTLE_LIMIT] [--throttle-pause THROTTLE_PAUSE] - [--exclude [EXCLUDE ...]] + [--exclude [EXCLUDE ...]] [--retries MAX_RETRIES] USER Backup a github account @@ -71,6 +73,7 @@ CLI Help output:: -f, --token-fine TOKEN_FINE fine-grained personal access token (github_pat_....), or path to token (file://...) + --token-from-gh read token from GitHub CLI (gh auth token) -q, --quiet supress log messages less severe than warning, e.g. info --as-app authenticate as github app instead of as a user. @@ -95,12 +98,16 @@ CLI Help output:: --issue-events include issue events in backup --pulls include pull requests in backup --pull-comments include pull request review comments in backup + --pull-reviews include pull request reviews in backup --pull-commits include pull request commits in backup --pull-details include more pull request details in backup [*] --labels include labels in backup --hooks include hooks in backup (works only when authenticated) --milestones include milestones in backup + --security-advisories + include security advisories in backup + --discussions include discussions in backup --repositories include repository clone in backup --bare clone bare repositories --no-prune disable prune option for git fetch @@ -141,8 +148,8 @@ CLI Help output:: applies if including releases --skip-assets-on [SKIP_ASSETS_ON ...] skip asset downloads for these repositories - --attachments download user-attachments from issues and pull - requests + --attachments download user-attachments from issues, pull requests, + and discussions --throttle-limit THROTTLE_LIMIT start throttling of GitHub API requests after this amount of API requests remain @@ -152,7 +159,8 @@ CLI Help output:: --throttle-limit to be set) --exclude [EXCLUDE ...] names of repositories to exclude - + --retries MAX_RETRIES + maximum number of retries for API calls (default: 5) Usage Details ============= @@ -168,6 +176,8 @@ The positional argument ``USER`` specifies the user or organization account you **Classic tokens** (``-t TOKEN``) are `slightly less secure `_ as they provide very coarse-grained permissions. +If you already authenticate with the `GitHub CLI `_, you can use ``--token-from-gh`` to read the token with ``gh auth token`` instead of passing a token directly. This avoids placing the token in shell history or process arguments. When ``--github-host`` is set, the token is read with ``gh auth token --hostname HOST``. + Fine Tokens ~~~~~~~~~~~ @@ -178,7 +188,7 @@ Customise the permissions for your use case, but for a personal account full bac **User permissions**: Read access to followers, starring, and watching. -**Repository permissions**: Read access to contents, issues, metadata, pull requests, and webhooks. +**Repository permissions**: Read access to contents, discussions, issues, metadata, pull requests, and webhooks. GitHub Apps @@ -239,7 +249,7 @@ Note: When you run github-backup, you will be asked whether you want to allow " Github Rate-limit and Throttling -------------------------------- -"github-backup" will automatically throttle itself based on feedback from the Github API. +"github-backup" will automatically throttle itself based on feedback from the Github API. Their API is usually rate-limited to 5000 calls per hour. The API will ask github-backup to pause until a specific time when the limit is reset again (at the start of the next hour). This continues until the backup is complete. @@ -259,9 +269,9 @@ LFS objects are fetched for all refs, not just the current checkout, ensuring a About Attachments ----------------- -When you use the ``--attachments`` option with ``--issues`` or ``--pulls``, the tool will download user-uploaded attachments (images, videos, documents, etc.) from issue and pull request descriptions and comments. In some circumstances attachments contain valuable data related to the topic, and without their backup important information or context might be lost inadvertently. +When you use the ``--attachments`` option with ``--issues``, ``--pulls`` or ``--discussions``, the tool will download user-uploaded attachments (images, videos, documents, etc.) from issue, pull request and discussion descriptions and comments. In some circumstances attachments contain valuable data related to the topic, and without their backup important information or context might be lost inadvertently. -Attachments are saved to ``issues/attachments/{issue_number}/`` and ``pulls/attachments/{pull_number}/`` directories, where ``{issue_number}`` is the GitHub issue number (e.g., issue #123 saves to ``issues/attachments/123/``). Each attachment directory contains: +Attachments are saved to ``issues/attachments/{issue_number}/``, ``pulls/attachments/{pull_number}/`` and ``discussions/attachments/{discussion_number}/`` directories, where ``{issue_number}`` is the GitHub issue number (e.g., issue #123 saves to ``issues/attachments/123/``). Each attachment directory contains: - The downloaded attachment files (named by their GitHub identifier with appropriate file extensions) - If multiple attachments have the same filename, conflicts are resolved with numeric suffixes (e.g., ``report.pdf``, ``report_1.pdf``, ``report_2.pdf``) @@ -278,6 +288,29 @@ The tool automatically extracts file extensions from HTTP headers to ensure file **Repository filtering** for repo files/assets handles renamed and transferred repositories gracefully. URLs are included if they either match the current repository name directly, or redirect to it (e.g., ``willmcgugan/rich`` redirects to ``Textualize/rich`` after transfer). +**Fine-grained token limitation:** Due to a GitHub platform limitation, fine-grained personal access tokens (``github_pat_...``) cannot download attachments from private repositories directly. This affects both ``/assets/`` (images) and ``/files/`` (documents) URLs. The tool implements a workaround for image attachments using GitHub's Markdown API, which converts URLs to temporary JWT-signed URLs that can be downloaded. However, this workaround only works for images - document attachments (PDFs, text files, etc.) will fail with 404 errors when using fine-grained tokens on private repos. For full attachment support on private repositories, use a classic token (``-t``) instead of a fine-grained token (``-f``). See `#477 `_ for details. + + +About Discussions +----------------- + +GitHub Discussions are backed up with GitHub's GraphQL API because the REST API does not expose discussions. Use ``--discussions`` to save each discussion as JSON under ``repositories/{repo}/discussions/{number}.json``. Discussion backups include the discussion body and metadata, category information, comments, and comment replies. + +``--discussions`` is included in ``--all``. Unlike most REST API-backed resources, discussions require authentication because GitHub's GraphQL API requires a token. Fine-grained personal access tokens and GitHub Apps need read access to the repository's Discussions permission. + +Incremental backups use a per-repository checkpoint at ``repositories/{repo}/discussions/last_update`` based on discussion ``updatedAt`` timestamps. This is separate from the repository-level ``last_update`` file so discussion activity is not missed if the repository's own update timestamp does not change. If you enable ``--discussions`` on an existing incremental backup, the first run performs a full discussions backup for each repository and creates the discussions checkpoint for future runs. + + +About security advisories +------------------------- + +GitHub security advisories are only available in public repositories. GitHub does not provide the respective API endpoint for private repositories. + +Therefore the logic is implemented as follows: +- Security advisories are included in the `--all` option. +- If only the `--all` option was provided, backups of security advisories are skipped for private repositories. +- If the `--security-advisories` option is provided (on its own or in addition to `--all`), a backup of security advisories is attempted for all repositories, with graceful handling if the GitHub API doesn't return any. + Run in Docker container ----------------------- @@ -292,7 +325,12 @@ Gotchas / Known-issues All is not everything --------------------- -The ``--all`` argument does not include: cloning private repos (``-P, --private``), cloning forks (``-F, --fork``), cloning starred repositories (``--all-starred``), ``--pull-details``, cloning LFS repositories (``--lfs``), cloning gists (``--gists``) or cloning starred gist repos (``--starred-gists``). See examples for more. +The ``--all`` argument does not include: downloading attachments from issue and pull request comments (``--attachments``), cloning private repos (``-P, --private``), cloning forks (``-F, --fork``), cloning starred repositories (``--all-starred``), ``--pull-details``, cloning LFS repositories (``--lfs``), cloning gists (``--gists``) or cloning starred gist repos (``--starred-gists``). See examples for more. + +Saves nothing if no arguments are passed +---------------------------------------- + +At least one argument like ``--all`` or ``--repositories`` is needed for github-backup to actually save data. Without relevant arguments, github-backup fetches some data from GitHub but doesn't put any of it into files. Starred repository size ----------------------- @@ -309,25 +347,37 @@ For finer control, avoid using ``--assets`` with starred repos, or use ``--skip- Alternatively, consider just storing links to starred repos in JSON format with ``--starred``. +About pull request reviews +-------------------------- + +Use ``--pull-reviews`` with ``--pulls`` to include GitHub pull request review metadata under each pull request's ``review_data`` key. Reviews are separate from review comments: ``--pull-comments`` backs up inline review comments via ``comment_data`` and regular PR conversation comments via ``comment_regular_data``, while ``--pull-reviews`` backs up review state, submitted time, commit ID, and the top-level review body. + +``--pull-reviews`` is included in ``--all``. Incremental backups use a per-repository checkpoint at ``repositories/{repo}/pulls/reviews_last_update``. If ``--pull-reviews`` is enabled on an existing incremental backup, the first run performs a one-time backfill for pull request reviews so older PRs are not skipped by the existing pull request checkpoint. Existing ``comment_data``, ``comment_regular_data`` and ``commit_data`` fields are preserved when only review data is being added. + + Incremental Backup ------------------ -Using (``-i, --incremental``) will only request new data from the API **since the last run (successful or not)**. e.g. only request issues from the API since the last run. +Using (``-i, --incremental``) will only request new data from the API **since the last successful resource backup**. e.g. only request issues from the API since the last issue backup for that repository. + +Incremental checkpoints for issue and pull request API backups are stored per resource in that repository's backup directory (for example ``repositories/{repo}/issues/last_update``, ``repositories/{repo}/pulls/last_update`` or ``starred/{owner}/{repo}/pulls/last_update``). Older versions stored a single global ``last_update`` file in the output directory root. During migration, the legacy global checkpoint is used as a fallback only for resource directories that already contain backup data but do not yet have their own checkpoint. New repositories or newly enabled resources with no existing data get a full backup instead of inheriting an unrelated global checkpoint. -This means any blocking errors on previous runs can cause a large amount of missing data in backups. +After all existing issue and pull request resource directories have per-resource checkpoints, the legacy global ``last_update`` file is removed automatically. + +This means any blocking errors on previous runs can cause missing data in backups for the affected repository resource. Using (``--incremental-by-files``) will request new data from the API **based on when the file was modified on filesystem**. e.g. if you modify the file yourself you may miss something. -Still saver than the previous version. +Still safer than the previous version. -Specifically, issues and pull requests are handled like this. +Incremental backup only changes how issue and pull request data is fetched. Known blocking errors --------------------- Some errors will block the backup run by exiting the script. e.g. receiving a 403 Forbidden error from the Github API. -If the incremental argument is used, this will result in the next backup only requesting API data since the last blocked/failed run. Potentially causing unexpected large amounts of missing data. +If the incremental argument is used, per-resource checkpoints are only advanced after that resource's backup work completes. A blocking error can still abort the overall run, but repositories and resources that were not processed will keep their previous checkpoints. It's therefore recommended to only use the incremental argument if the output/result is being actively monitored, or complimented with periodic full non-incremental runs, to avoid unexpected missing data in a regular backup runs. @@ -384,12 +434,12 @@ Github Backup Examples Backup all repositories, including private ones using a classic token:: - export ACCESS_TOKEN=SOME-GITHUB-TOKEN + ACCESS_TOKEN=SOME-GITHUB-TOKEN github-backup WhiteHouse --token $ACCESS_TOKEN --organization --output-directory /tmp/white-house --repositories --private Use a fine-grained access token to backup a single organization repository with everything else (wiki, pull requests, comments, issues etc):: - export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN + FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN ORGANIZATION=docker REPO=cli # e.g. git@github.com:docker/cli.git @@ -397,17 +447,17 @@ Use a fine-grained access token to backup a single organization repository with Quietly and incrementally backup useful Github user data (public and private repos with SSH) including; all issues, pulls, all public starred repos and gists (omitting "hooks", "releases" and therefore "assets" to prevent blocking). *Great for a cron job.* :: - export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN + FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN GH_USER=YOUR-GITHUB-USER - github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER - + github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-reviews --pull-commits --labels --milestones --security-advisories --discussions --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER + Debug an error/block or incomplete backup into a temporary directory. Omit "incremental" to fill a previous incomplete backup. :: - export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN + FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN GH_USER=YOUR-GITHUB-USER - github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER + github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-reviews --pull-commits --labels --milestones --discussions --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER Pipe a token from stdin to avoid storing it in environment variables or command history (Unix-like systems only):: @@ -423,7 +473,7 @@ This tool creates backups only, there is no inbuilt restore command. cd /tmp/white-house/repositories/petitions/repository git push --mirror git@github.com:WhiteHouse/petitions.git -**Issues, pull requests, comments, and other metadata** are saved as JSON files for archival purposes. The GitHub API does not support recreating this data faithfully, creating issues via the API has limitations: +**Issues, pull requests, discussions, comments, and other metadata** are saved as JSON files for archival purposes. The GitHub API does not support recreating this data faithfully, creating issues via the API has limitations: - New issue/PR numbers are assigned (original numbers cannot be set) - Timestamps reflect creation time (original dates cannot be set) diff --git a/github_backup/__init__.py b/github_backup/__init__.py index 25dbb4b..b7b61f3 100644 --- a/github_backup/__init__.py +++ b/github_backup/__init__.py @@ -1 +1 @@ -__version__ = "0.59.0" +__version__ = "0.62.1" diff --git a/github_backup/cli.py b/github_backup/cli.py index 54849d4..987ae71 100644 --- a/github_backup/cli.py +++ b/github_backup/cli.py @@ -46,6 +46,16 @@ def main(): "Use -t/--token or -f/--token-fine to authenticate." ) + # Issue #477: Fine-grained PATs cannot download all attachment types from + # private repos. Image attachments will be retried via Markdown API workaround. + if args.include_attachments and args.token_fine: + logger.warning( + "Using --attachments with fine-grained token. Due to GitHub platform " + "limitations, file attachments (PDFs, etc.) from private repos may fail. " + "Image attachments will be retried via workaround. For full attachment " + "support, use --token-classic instead." + ) + if args.quiet: logger.setLevel(logging.WARNING) diff --git a/github_backup/github_backup.py b/github_backup/github_backup.py index 1d4e354..fd8a080 100644 --- a/github_backup/github_backup.py +++ b/github_backup/github_backup.py @@ -6,7 +6,6 @@ import base64 import calendar import codecs -import errno import json import logging import os @@ -33,17 +32,23 @@ except ImportError: VERSION = "unknown" -FNULL = open(os.devnull, "w") +from .graphql_queries import ( + DISCUSSION_DETAIL_QUERY, + DISCUSSION_LIST_QUERY, + DISCUSSION_PAGE_SIZE, + DISCUSSION_REPLIES_QUERY, +) + FILE_URI_PREFIX = "file://" logger = logging.getLogger(__name__) class RepositoryUnavailableError(Exception): - """Raised when a repository is unavailable due to legal reasons (e.g., DMCA takedown).""" + """Raised when a repository is unavailable due to legal reasons (e.g., DMCA takedown, TOS violation).""" - def __init__(self, message, dmca_url=None): + def __init__(self, message, legal_url=None): super().__init__(message) - self.dmca_url = dmca_url + self.legal_url = legal_url # Setup SSL context with fallback chain @@ -74,9 +79,6 @@ def __init__(self, message, dmca_url=None): " 3. Debian/Ubuntu: apt-get install ca-certificates\n\n" ) -# Retry configuration -MAX_RETRIES = 5 - def logging_subprocess( popenargs, stdout_log_level=logging.DEBUG, stderr_log_level=logging.ERROR, **kwargs @@ -124,13 +126,7 @@ def check_io(): def mkdir_p(*args): for path in args: - try: - os.makedirs(path) - except OSError as exc: # Python >2.5 - if exc.errno == errno.EEXIST and os.path.isdir(path): - pass - else: - raise + os.makedirs(path, exist_ok=True) def mask_password(url, secret="*****"): @@ -144,6 +140,17 @@ def mask_password(url, secret="*****"): return url.replace(parsed.password, secret) +def non_negative_int(value): + """Argparse type validator for non-negative integers.""" + try: + ivalue = int(value) + except ValueError: + raise argparse.ArgumentTypeError(f"'{value}' is not a valid integer") + if ivalue < 0: + raise argparse.ArgumentTypeError(f"{value} must be 0 or greater") + return ivalue + + def parse_args(args=None): parser = argparse.ArgumentParser(description="Backup a github account") parser.add_argument("user", metavar="USER", type=str, help="github username") @@ -159,6 +166,12 @@ def parse_args(args=None): dest="token_fine", help="fine-grained personal access token (github_pat_....), or path to token (file://...)", ) # noqa + parser.add_argument( + "--token-from-gh", + action="store_true", + dest="token_from_gh", + help="read token from GitHub CLI (gh auth token)", + ) parser.add_argument( "-q", "--quiet", @@ -272,6 +285,12 @@ def parse_args(args=None): dest="include_pull_comments", help="include pull request review comments in backup", ) + parser.add_argument( + "--pull-reviews", + action="store_true", + dest="include_pull_reviews", + help="include pull request reviews in backup", + ) parser.add_argument( "--pull-commits", action="store_true", @@ -302,6 +321,18 @@ def parse_args(args=None): dest="include_milestones", help="include milestones in backup", ) + parser.add_argument( + "--security-advisories", + action="store_true", + dest="include_security_advisories", + help="include security advisories in backup", + ) + parser.add_argument( + "--discussions", + action="store_true", + dest="include_discussions", + help="include discussions in backup", + ) parser.add_argument( "--repositories", action="store_true", @@ -449,7 +480,7 @@ def parse_args(args=None): "--attachments", action="store_true", dest="include_attachments", - help="download user-attachments from issues and pull requests", + help="download user-attachments from issues, pull requests, and discussions [*]", ) parser.add_argument( "--throttle-limit", @@ -468,6 +499,13 @@ def parse_args(args=None): parser.add_argument( "--exclude", dest="exclude", help="names of repositories to exclude", nargs="*" ) + parser.add_argument( + "--retries", + dest="max_retries", + type=non_negative_int, + default=5, + help="maximum number of retries for API calls (default: 5)", + ) return parser.parse_args(args) @@ -483,19 +521,18 @@ def get_auth(args, encode=True, for_git_cli=False): if platform.system() != "Darwin": raise Exception("Keychain arguments are only supported on Mac OSX") try: - with open(os.devnull, "w") as devnull: - token = subprocess.check_output( - [ - "security", - "find-generic-password", - "-s", - args.osx_keychain_item_name, - "-a", - args.osx_keychain_item_account, - "-w", - ], - stderr=devnull, - ).strip() + token = subprocess.check_output( + [ + "security", + "find-generic-password", + "-s", + args.osx_keychain_item_name, + "-a", + args.osx_keychain_item_account, + "-w", + ], + stderr=subprocess.DEVNULL, + ).strip() token = token.decode("utf-8") auth = token + ":" + "x-oauth-basic" except subprocess.SubprocessError: @@ -508,7 +545,7 @@ def get_auth(args, encode=True, for_git_cli=False): ) elif args.token_fine: if args.token_fine.startswith(FILE_URI_PREFIX): - args.token_fine = read_file_contents(args.token_fine) + args.token_fine = read_first_line(args.token_fine) if args.token_fine.startswith("github_pat_"): auth = args.token_fine @@ -516,9 +553,15 @@ def get_auth(args, encode=True, for_git_cli=False): raise Exception( "Fine-grained token supplied does not look like a GitHub PAT" ) - elif args.token_classic: - if args.token_classic.startswith(FILE_URI_PREFIX): - args.token_classic = read_file_contents(args.token_classic) + elif args.token_classic or args.token_from_gh: + if args.token_from_gh: + if args.as_app: + raise Exception( + "--token-from-gh cannot be used with --as-app; provide the app token with --token instead" + ) + args.token_classic = read_token_from_gh_cli(args) + elif args.token_classic.startswith(FILE_URI_PREFIX): + args.token_classic = read_first_line(args.token_classic) if not args.as_app: auth = args.token_classic + ":" + "x-oauth-basic" @@ -546,6 +589,31 @@ def get_github_api_host(args): return host +def get_github_graphql_url(args): + if args.github_host: + return "https://{0}/api/graphql".format(args.github_host) + + return "https://api.github.com/graphql" + + +def get_graphql_auth(args): + auth = get_auth(args, encode=False) + if not auth: + return None + + # GraphQL expects a bearer token. Classic tokens and keychain tokens use + # "token:x-oauth-basic" for REST Basic auth, so strip the synthetic + # password before sending the GraphQL Authorization header. + if ( + not getattr(args, "as_app", False) + and getattr(args, "token_fine", None) is None + and ":" in auth + ): + auth = auth.split(":", 1)[0] + + return auth + + def get_github_host(args): if args.github_host: host = args.github_host @@ -555,8 +623,41 @@ def get_github_host(args): return host -def read_file_contents(file_uri): - return open(file_uri[len(FILE_URI_PREFIX) :], "rt").readline().strip() +def read_first_line(file_uri): + with open(file_uri[len(FILE_URI_PREFIX) :], "rt") as f: + return f.readline().strip() + + +def read_token_from_gh_cli(args): + cached_token = getattr(args, "_token_from_gh_value", None) + if cached_token: + return cached_token + + command = ["gh", "auth", "token"] + if args.github_host: + command.extend(["--hostname", get_github_host(args)]) + + try: + token = subprocess.check_output(command, stderr=subprocess.PIPE).decode( + "utf-8" + ).strip() + except FileNotFoundError: + raise Exception( + "Unable to read token from GitHub CLI: 'gh' executable not found" + ) + except subprocess.CalledProcessError as e: + stderr = e.stderr.decode("utf-8", errors="replace").strip() + if stderr: + raise Exception( + "Unable to read token from GitHub CLI: {0}".format(stderr) + ) + raise Exception("Unable to read token from GitHub CLI") + + if not token: + raise Exception("Unable to read token from GitHub CLI: token was empty") + + args._token_from_gh_value = token + return token def get_github_repo_url(args, repository): @@ -608,11 +709,12 @@ def calculate_retry_delay(attempt, headers): return delay + random.uniform(0, delay * 0.1) -def retrieve_data(args, template, query_args=None, paginated=True): +def retrieve_data(args, template, query_args=None, paginated=True, lazy=False): """ Fetch the data from GitHub API. - Handle both single requests and pagination with yield of individual dicts. + Handle both single requests and pagination. Returns a list by default, or + a generator when lazy=True so callers can stop before fetching every page. Handles throttling, retries, read errors, and DMCA takedowns. """ query_args = query_args or {} @@ -622,16 +724,24 @@ def retrieve_data(args, template, query_args=None, paginated=True): def _extract_next_page_url(link_header): for link in link_header.split(","): if 'rel="next"' in link: - return link[link.find("<") + 1:link.find(">")] + return link[link.find("<") + 1 : link.find(">")] return None def fetch_all() -> Generator[dict, None, None]: + def _extract_legal_url(response_body_bytes): + """Extract DMCA/legal notice URL from GitHub API error response body.""" + try: + data = json.loads(response_body_bytes.decode("utf-8")) + return data.get("block", {}).get("html_url") + except Exception: + return None + next_url = None while True: # FIRST: Fetch response - for attempt in range(MAX_RETRIES): + for attempt in range(args.max_retries + 1): request = _construct_request( per_page=per_page if paginated else None, query_args=query_args, @@ -640,53 +750,73 @@ def fetch_all() -> Generator[dict, None, None]: as_app=args.as_app, fine=args.token_fine is not None, ) - http_response = make_request_with_retry(request, auth) - - match http_response.getcode(): - case 200: - # Success - Parse JSON response - try: - response = json.loads(http_response.read().decode("utf-8")) - break # Exit retry loop and handle the data returned - except ( - IncompleteRead, - json.decoder.JSONDecodeError, - TimeoutError, - ) as e: - logger.warning(f"{type(e).__name__} reading response") - if attempt < MAX_RETRIES - 1: - delay = calculate_retry_delay(attempt, {}) - logger.warning( - f"Retrying in {delay:.1f}s (attempt {attempt + 1}/{MAX_RETRIES})" - ) - time.sleep(delay) - continue # Next retry attempt - - case 451: - # DMCA takedown - extract URL if available, then raise - dmca_url = None - try: - response_data = json.loads( - http_response.read().decode("utf-8") - ) - dmca_url = response_data.get("block", {}).get("html_url") - except Exception: - pass + try: + http_response = make_request_with_retry( + request, auth, args.max_retries + ) + except HTTPError as exc: + if exc.code == 451: + legal_url = _extract_legal_url(exc.read()) raise RepositoryUnavailableError( - "Repository unavailable due to legal reasons (HTTP 451)", - dmca_url=dmca_url, + f"Repository unavailable due to legal reasons (HTTP {exc.code})", + legal_url=legal_url, ) + elif exc.code == 403: + # Rate-limit 403s (x-ratelimit-remaining=0) are retried + # by make_request_with_retry — re-raise if exhausted. + if int(exc.headers.get("x-ratelimit-remaining", 1)) < 1: + raise + # Only convert to RepositoryUnavailableError if GitHub + # indicates a TOS/DMCA block (response contains "block" + # key). Other 403s (permissions, scopes) should propagate. + body = exc.read() + try: + data = json.loads(body.decode("utf-8")) + except Exception: + data = {} + if "block" in data: + raise RepositoryUnavailableError( + "Repository access blocked (HTTP 403)", + legal_url=data.get("block", {}).get("html_url"), + ) + raise + else: + raise + + # urlopen raises HTTPError for non-2xx, so only success gets here. + # Guard against unexpected status codes from proxies, future Python + # changes, or other edge cases we haven't considered. + status = http_response.getcode() + if status != 200: + raise Exception( + f"Unexpected HTTP {status} from {next_url or template} " + f"(expected non-2xx to raise HTTPError)" + ) - case _: - raise Exception( - f"API request returned HTTP {http_response.getcode()}: {http_response.reason}" + # Parse JSON response + try: + response = json.loads(http_response.read().decode("utf-8")) + break # Exit retry loop and handle the data returned + except ( + ConnectionError, + IncompleteRead, + json.decoder.JSONDecodeError, + TimeoutError, + ) as e: + logger.warning(f"{type(e).__name__} reading response") + if attempt < args.max_retries: + delay = calculate_retry_delay(attempt, {}) + logger.warning( + f"Retrying read in {delay:.1f}s (attempt {attempt + 1}/{args.max_retries + 1})" ) + time.sleep(delay) + continue # Next retry attempt else: logger.error( - f"Failed to read response after {MAX_RETRIES} attempts for {next_url or template}" + f"Failed to read response after {args.max_retries + 1} attempts for {next_url or template}" ) raise Exception( - f"Failed to read response after {MAX_RETRIES} attempts for {next_url or template}" + f"Failed to read response after {args.max_retries + 1} attempts for {next_url or template}" ) # SECOND: Process and paginate @@ -715,10 +845,94 @@ def fetch_all() -> Generator[dict, None, None]: ): break # No more data + if lazy: + return fetch_all() + return list(fetch_all()) -def make_request_with_retry(request, auth): +def retrieve_graphql_data(args, query, variables=None, log_context=None): + """Fetch data from GitHub's GraphQL API.""" + auth = get_graphql_auth(args) + if not auth: + raise Exception("GitHub GraphQL API requires authentication") + + variables = variables or {} + payload = json.dumps( + {"query": query, "variables": variables}, ensure_ascii=False + ).encode("utf-8") + endpoint = get_github_graphql_url(args) + + for attempt in range(args.max_retries + 1): + request = Request(endpoint, data=payload, method="POST") + request.add_header("Accept", "application/json") + request.add_header("Content-Type", "application/json") + request.add_header("Authorization", "bearer " + auth) + log_url = endpoint + if log_context: + log_url = "{0} ({1})".format(log_url, log_context) + logger.info("Requesting {0}".format(log_url)) + + http_response = make_request_with_retry(request, auth, args.max_retries) + + status = http_response.getcode() + if status != 200: + raise Exception( + f"Unexpected HTTP {status} from {endpoint} " + f"(expected non-2xx to raise HTTPError)" + ) + + try: + response = json.loads(http_response.read().decode("utf-8")) + except (IncompleteRead, json.decoder.JSONDecodeError, TimeoutError) as e: + logger.warning(f"{type(e).__name__} reading GraphQL response") + if attempt < args.max_retries: + delay = calculate_retry_delay(attempt, {}) + logger.warning( + f"Retrying GraphQL read in {delay:.1f}s " + f"(attempt {attempt + 1}/{args.max_retries + 1})" + ) + time.sleep(delay) + continue + raise Exception( + f"Failed to read GraphQL response after {args.max_retries + 1} " + f"attempts for {endpoint}" + ) + + if ( + remaining := int(http_response.headers.get("x-ratelimit-remaining", 0)) + ) <= (args.throttle_limit or 0): + if args.throttle_limit: + logger.info( + f"Throttling: {remaining} requests left, pausing {args.throttle_pause}s" + ) + time.sleep(args.throttle_pause) + + errors = response.get("errors") or [] + if errors: + if any(error.get("type") == "RATE_LIMITED" for error in errors): + if attempt < args.max_retries: + delay = calculate_retry_delay(attempt, http_response.headers) + logger.warning( + f"GraphQL rate limit hit, retrying in {delay:.1f}s " + f"(attempt {attempt + 1}/{args.max_retries + 1})" + ) + time.sleep(delay) + continue + + messages = "; ".join( + error.get("message", str(error)) for error in errors + ) + raise Exception("GraphQL Error: {0}".format(messages)) + + return response.get("data", {}) + + raise Exception( + f"GraphQL request failed after {args.max_retries + 1} attempts" + ) # pragma: no cover + + +def make_request_with_retry(request, auth, max_retries=5): """Make HTTP request with automatic retry for transient errors.""" def is_retryable_status(status_code, headers): @@ -730,40 +944,49 @@ def is_retryable_status(status_code, headers): return int(headers.get("x-ratelimit-remaining", 1)) < 1 return False - for attempt in range(MAX_RETRIES): + for attempt in range(max_retries + 1): try: return urlopen(request, context=https_ctx) except HTTPError as exc: # HTTPError can be used as a response-like object if not is_retryable_status(exc.code, exc.headers): + logger.error( + f"API Error: {exc.code} {exc.reason} for {request.full_url}" + ) raise # Non-retryable error - if attempt >= MAX_RETRIES - 1: - logger.error(f"HTTP {exc.code} failed after {MAX_RETRIES} attempts") + if attempt >= max_retries: + logger.error( + f"HTTP {exc.code} failed after {max_retries + 1} attempts for {request.full_url}" + ) raise delay = calculate_retry_delay(attempt, exc.headers) logger.warning( - f"HTTP {exc.code}, retrying in {delay:.1f}s " - f"(attempt {attempt + 1}/{MAX_RETRIES})" + f"HTTP {exc.code} ({exc.reason}), retrying in {delay:.1f}s " + f"(attempt {attempt + 1}/{max_retries + 1}) for {request.full_url}" ) if auth is None and exc.code in (403, 429): logger.info("Hint: Authenticate to raise your GitHub rate limit") time.sleep(delay) except (URLError, socket.error) as e: - if attempt >= MAX_RETRIES - 1: - logger.error(f"Connection error failed after {MAX_RETRIES} attempts: {e}") + if attempt >= max_retries: + logger.error( + f"Connection error failed after {max_retries + 1} attempts: {e} for {request.full_url}" + ) raise delay = calculate_retry_delay(attempt, {}) logger.warning( f"Connection error: {e}, retrying in {delay:.1f}s " - f"(attempt {attempt + 1}/{MAX_RETRIES})" + f"(attempt {attempt + 1}/{max_retries + 1}) for {request.full_url}" ) time.sleep(delay) - raise Exception(f"Request failed after {MAX_RETRIES} attempts") # pragma: no cover + raise Exception( + f"Request failed after {max_retries + 1} attempts" + ) # pragma: no cover def _construct_request(per_page, query_args, template, auth, as_app=None, fine=False): @@ -1032,8 +1255,67 @@ def download_attachment_file(url, path, auth, as_app=False, fine=False): return metadata +def get_jwt_signed_url_via_markdown_api(url, token, repo_context): + """Convert a user-attachments/assets URL to a JWT-signed URL via Markdown API. + + GitHub's Markdown API renders image URLs and returns HTML containing + JWT-signed private-user-images.githubusercontent.com URLs that work + without token authentication. + + This is a workaround for issue #477 where fine-grained PATs cannot + download user-attachments URLs from private repos directly. + + Limitations: + - Only works for /assets/ URLs (images) + - Does NOT work for /files/ URLs (PDFs, text files, etc.) + - JWT URLs expire after ~5 minutes + + Args: + url: The github.com/user-attachments/assets/UUID URL + token: Raw fine-grained PAT (github_pat_...) + repo_context: Repository context as "owner/repo" + + Returns: + str: JWT-signed URL from private-user-images.githubusercontent.com + None: If conversion fails + """ + + try: + payload = json.dumps( + {"text": f"![img]({url})", "mode": "gfm", "context": repo_context} + ).encode("utf-8") + + request = Request("https://api.github.com/markdown", data=payload, method="POST") + request.add_header("Authorization", f"token {token}") + request.add_header("Content-Type", "application/json") + request.add_header("Accept", "application/vnd.github+json") + + html = urlopen(request, context=https_ctx, timeout=30).read().decode("utf-8") + + # Parse JWT-signed URL from HTML response + # Format: + if match := re.search( + r'src="(https://private-user-images\.githubusercontent\.com/[^"]+)"', html + ): + jwt_url = match.group(1) + logger.debug("Converted attachment URL to JWT-signed URL via Markdown API") + return jwt_url + + logger.debug("Markdown API response did not contain JWT-signed URL") + return None + + except HTTPError as e: + logger.debug( + "Markdown API request failed with HTTP {0}: {1}".format(e.code, e.reason) + ) + return None + except Exception as e: + logger.debug("Markdown API request failed: {0}".format(str(e))) + return None + + def extract_attachment_urls(item_data, issue_number=None, repository_full_name=None): - """Extract GitHub-hosted attachment URLs from issue/PR body and comments. + """Extract GitHub-hosted attachment URLs from issue/PR/discussion body and comments. What qualifies as an attachment? There is no "attachment" concept in the GitHub API - it's a user behavior pattern @@ -1175,33 +1457,29 @@ def redirect_request(self, req, fp, code, msg, headers, newurl): # and exclude the URL to avoid downloading from wrong repos return False + def extract_from_text(text): + text_cleaned = remove_code_blocks(text or "") + for pattern in patterns: + found_urls = re.findall(pattern, text_cleaned) + urls.extend([clean_url(url) for url in found_urls]) + + def extract_from_comments(comments): + for comment in comments: + extract_from_text(comment.get("body") or "") + # GitHub Discussions support one level of replies. Issues and pull + # requests don't have reply_data, so this is a no-op for them. + extract_from_comments(comment.get("reply_data") or []) + # Extract from body - body = item_data.get("body") or "" - # Remove code blocks before searching for URLs - body_cleaned = remove_code_blocks(body) - for pattern in patterns: - found_urls = re.findall(pattern, body_cleaned) - urls.extend([clean_url(url) for url in found_urls]) - - # Extract from issue comments + extract_from_text(item_data.get("body") or "") + + # Extract from issue comments and discussion comments if "comment_data" in item_data: - for comment in item_data["comment_data"]: - comment_body = comment.get("body") or "" - # Remove code blocks before searching for URLs - comment_cleaned = remove_code_blocks(comment_body) - for pattern in patterns: - found_urls = re.findall(pattern, comment_cleaned) - urls.extend([clean_url(url) for url in found_urls]) + extract_from_comments(item_data["comment_data"]) # Extract from PR regular comments if "comment_regular_data" in item_data: - for comment in item_data["comment_regular_data"]: - comment_body = comment.get("body") or "" - # Remove code blocks before searching for URLs - comment_cleaned = remove_code_blocks(comment_body) - for pattern in patterns: - found_urls = re.findall(pattern, comment_cleaned) - urls.extend([clean_url(url) for url in found_urls]) + extract_from_comments(item_data["comment_regular_data"]) regex_urls = list(set(urls)) # dedupe @@ -1303,20 +1581,24 @@ def resolve_filename_collision(filepath): def download_attachments( args, item_cwd, item_data, number, repository, item_type="issue" ): - """Download user-attachments from issue/PR body and comments with manifest. + """Download user-attachments from issue/PR/discussion body and comments with manifest. Args: args: Command line arguments - item_cwd: Working directory (issue_cwd or pulls_cwd) - item_data: Issue or PR data dict - number: Issue or PR number + item_cwd: Working directory (issue_cwd, pulls_cwd, or discussion_cwd) + item_data: Issue, PR, or discussion data dict + number: Issue, PR, or discussion number repository: Repository dict - item_type: "issue" or "pull" for logging/manifest + item_type: "issue", "pull", or "discussion" for logging/manifest """ import json from datetime import datetime, timezone - item_type_display = "issue" if item_type == "issue" else "pull request" + item_type_display = { + "issue": "issue", + "pull": "pull request", + "discussion": "discussion", + }.get(item_type, item_type) urls = extract_attachment_urls( item_data, issue_number=number, repository_full_name=repository["full_name"] @@ -1385,15 +1667,46 @@ def download_attachments( filename = get_attachment_filename(url) filepath = os.path.join(attachments_dir, filename) - # Download and get metadata - metadata = download_attachment_file( - url, - filepath, - get_auth(args, encode=not args.as_app), - as_app=args.as_app, - fine=args.token_fine is not None, + # Issue #477: Fine-grained PATs cannot download user-attachments/assets + # from private repos directly (404). Use Markdown API workaround to get + # a JWT-signed URL. Only works for /assets/ (images), not /files/. + needs_jwt = ( + args.token_fine is not None + and repository.get("private", False) + and "github.com/user-attachments/assets/" in url ) + if not needs_jwt: + # NORMAL download path + metadata = download_attachment_file( + url, + filepath, + get_auth(args, encode=not args.as_app), + as_app=args.as_app, + fine=args.token_fine is not None, + ) + elif jwt_url := get_jwt_signed_url_via_markdown_api( + url, args.token_fine, repository["full_name"] + ): + # JWT needed and extracted, download via JWT + metadata = download_attachment_file( + jwt_url, filepath, auth=None, as_app=False, fine=False + ) + metadata["url"] = url # Apply back the original URL + metadata["jwt_workaround"] = True + else: + # Markdown API workaround failed - skip download we know will fail + metadata = { + "url": url, + "success": False, + "skipped_at": datetime.now(timezone.utc).isoformat(), + "error": "Fine-grained token cannot download private repo attachments. " + "Markdown API workaround failed. Use --token-classic instead.", + } + logger.warning( + "Skipping attachment {0}: {1}".format(url, metadata["error"]) + ) + # If download succeeded but we got an extension from Content-Disposition, # we may need to rename the file to add the extension if metadata["success"] and metadata.get("original_filename"): @@ -1430,6 +1743,8 @@ def download_attachments( # Write manifest if attachment_metadata_list: manifest = { + "item_number": number, + "item_type": item_type, "issue_number": number, "issue_type": item_type, "repository": ( @@ -1459,7 +1774,10 @@ def get_authenticated_user(args): def check_git_lfs_install(): - exit_code = subprocess.call(["git", "lfs", "version"]) + exit_code = subprocess.call( + ["git", "lfs", "version"], stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) if exit_code != 0: raise Exception( "The argument --lfs requires you to have Git LFS installed.\nYou can get it from https://git-lfs.github.com." @@ -1494,7 +1812,13 @@ def retrieve_repositories(args, authenticated_user): paginated = False template = "https://{0}/repos/{1}".format(get_github_api_host(args), repo_path) - repos = retrieve_data(args, template, paginated=paginated) + try: + repos = retrieve_data(args, template, paginated=paginated) + except RepositoryUnavailableError as e: + logger.warning(f"Repository is unavailable: {e}") + if e.legal_url: + logger.warning(f"Legal notice: {e.legal_url}") + return [] if args.all_starred: starred_template = "https://{0}/users/{1}/starred".format( @@ -1579,9 +1903,7 @@ def filter_repositories(args, unfiltered_repositories): repositories = [r for r in repositories if not r.get("archived")] if args.starred_skip_size_over is not None: if args.starred_skip_size_over <= 0: - logger.warning( - "--starred-skip-size-over must be greater than 0, ignoring" - ) + logger.warning("--starred-skip-size-over must be greater than 0, ignoring") else: size_limit_kb = args.starred_skip_size_over * 1024 filtered = [] @@ -1590,7 +1912,9 @@ def filter_repositories(args, unfiltered_repositories): size_mb = r.get("size", 0) / 1024 logger.info( "Skipping starred repo {0} ({1:.0f} MB) due to --starred-skip-size-over {2}".format( - r.get("full_name", r.get("name")), size_mb, args.starred_skip_size_over + r.get("full_name", r.get("name")), + size_mb, + args.starred_skip_size_over, ) ) else: @@ -1604,26 +1928,140 @@ def filter_repositories(args, unfiltered_repositories): return repositories +INCREMENTAL_LAST_UPDATE_FILENAME = "last_update" +INCREMENTAL_RESOURCE_DIRECTORIES = ("issues", "pulls") + + +def get_repository_checkpoint_time(repository): + timestamps = [ + timestamp + for timestamp in (repository.get("updated_at"), repository.get("pushed_at")) + if timestamp + ] + if timestamps: + return max(timestamps) + + return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime()) + + +def resource_backup_exists(resource_cwd): + if not os.path.isdir(resource_cwd): + return False + + ignored_names = { + INCREMENTAL_LAST_UPDATE_FILENAME, + PULL_REVIEWS_LAST_UPDATE_FILENAME, + } + for name in os.listdir(resource_cwd): + if name in ignored_names or name.endswith(".temp"): + continue + return True + + return False + + +def read_legacy_last_update(args, output_directory): + if not args.incremental: + return None, None + + last_update_path = os.path.join(output_directory, INCREMENTAL_LAST_UPDATE_FILENAME) + try: + with open(last_update_path) as f: + return last_update_path, f.read().strip() + except FileNotFoundError: + return last_update_path, None + + +def read_resource_last_update(args, resource_cwd, legacy_last_update=None): + if not args.incremental: + return None + + last_update_path = os.path.join(resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME) + try: + with open(last_update_path) as f: + return f.read().strip() + except FileNotFoundError: + if legacy_last_update and resource_backup_exists(resource_cwd): + return legacy_last_update + return None + + +def write_resource_last_update(args, resource_cwd, repository): + if not args.incremental: + return + + mkdir_p(resource_cwd) + last_update_path = os.path.join(resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME) + with open(last_update_path, "w") as f: + f.write(get_repository_checkpoint_time(repository)) + + +def iter_incremental_resource_dirs(output_directory): + repositories_dir = os.path.join(output_directory, "repositories") + if os.path.isdir(repositories_dir): + for repository_name in os.listdir(repositories_dir): + repo_cwd = os.path.join(repositories_dir, repository_name) + if not os.path.isdir(repo_cwd): + continue + for resource_name in INCREMENTAL_RESOURCE_DIRECTORIES: + yield os.path.join(repo_cwd, resource_name) + + starred_dir = os.path.join(output_directory, "starred") + if os.path.isdir(starred_dir): + for owner_name in os.listdir(starred_dir): + owner_cwd = os.path.join(starred_dir, owner_name) + if not os.path.isdir(owner_cwd): + continue + for repository_name in os.listdir(owner_cwd): + repo_cwd = os.path.join(owner_cwd, repository_name) + if not os.path.isdir(repo_cwd): + continue + for resource_name in INCREMENTAL_RESOURCE_DIRECTORIES: + yield os.path.join(repo_cwd, resource_name) + + +def has_unmigrated_incremental_resources(output_directory): + for resource_cwd in iter_incremental_resource_dirs(output_directory): + last_update_path = os.path.join( + resource_cwd, INCREMENTAL_LAST_UPDATE_FILENAME + ) + if resource_backup_exists(resource_cwd) and not os.path.exists( + last_update_path + ): + return True + + return False + + +def remove_legacy_last_update_if_migrated( + args, output_directory, legacy_last_update_path +): + if not args.incremental or not legacy_last_update_path: + return + if not os.path.exists(legacy_last_update_path): + return + if has_unmigrated_incremental_resources(output_directory): + logger.info( + "Keeping legacy global last_update until all existing issue/pull " + "backups have per-resource checkpoints" + ) + return + + os.remove(legacy_last_update_path) + logger.info( + "Removed legacy global last_update after migrating incremental checkpoints" + ) + + def backup_repositories(args, output_directory, repositories): logger.info("Backing up repositories") repos_template = "https://{0}/repos".format(get_github_api_host(args)) + legacy_last_update_path, legacy_last_update = read_legacy_last_update( + args, output_directory + ) + incremental_resource_work_attempted = False - if args.incremental: - last_update_path = os.path.join(output_directory, "last_update") - if os.path.exists(last_update_path): - args.since = open(last_update_path).read().strip() - else: - args.since = None - else: - args.since = None - - last_update = "0000-00-00T00:00:00Z" for repository in repositories: - if "updated_at" in repository and repository["updated_at"] > last_update: - last_update = repository["updated_at"] - elif "pushed_at" in repository and repository["pushed_at"] > last_update: - last_update = repository["pushed_at"] - if repository.get("is_gist"): repo_cwd = os.path.join(output_directory, "gists", repository["id"]) elif repository.get("is_starred"): @@ -1686,14 +2124,34 @@ def backup_repositories(args, output_directory, repositories): no_prune=args.no_prune, ) if args.include_issues or args.include_everything: + incremental_resource_work_attempted = True + issue_cwd = os.path.join(repo_cwd, "issues") + args.since = read_resource_last_update( + args, issue_cwd, legacy_last_update + ) backup_issues(args, repo_cwd, repository, repos_template) + write_resource_last_update(args, issue_cwd, repository) if args.include_pulls or args.include_everything: + incremental_resource_work_attempted = True + pulls_cwd = os.path.join(repo_cwd, "pulls") + args.since = read_resource_last_update( + args, pulls_cwd, legacy_last_update + ) backup_pulls(args, repo_cwd, repository, repos_template) + write_resource_last_update(args, pulls_cwd, repository) + + if args.include_discussions or args.include_everything: + backup_discussions(args, repo_cwd, repository) if args.include_milestones or args.include_everything: backup_milestones(args, repo_cwd, repository, repos_template) + if args.include_security_advisories or ( + args.include_everything and not repository.get("private", False) + ): + backup_security_advisories(args, repo_cwd, repository, repos_template) + if args.include_labels or args.include_everything: backup_labels(args, repo_cwd, repository, repos_template) @@ -1709,19 +2167,329 @@ def backup_repositories(args, output_directory, repositories): include_assets=args.include_assets or args.include_everything, ) except RepositoryUnavailableError as e: + logger.warning(f"Repository {repository['full_name']} is unavailable: {e}") + if e.legal_url: + logger.warning(f"Legal notice: {e.legal_url}") + logger.info(f"Skipping remaining resources for {repository['full_name']}") + continue + + if incremental_resource_work_attempted: + remove_legacy_last_update_if_migrated( + args, output_directory, legacy_last_update_path + ) + + +def _repository_owner_name(repository): + return repository["full_name"].split("/", 1) + + +def _connection_nodes(connection): + return [node for node in (connection or {}).get("nodes") or [] if node] + + +def retrieve_discussion_summaries(args, repository, since=None): + owner, name = _repository_owner_name(repository) + after = None + page = 1 + summaries = [] + newest_seen = None + discussions_enabled = None + total_count = 0 + + while True: + data = retrieve_graphql_data( + args, + DISCUSSION_LIST_QUERY, + { + "owner": owner, + "name": name, + "after": after, + "pageSize": DISCUSSION_PAGE_SIZE, + }, + log_context="discussion summaries {0} page {1}".format( + repository["full_name"], page + ), + ) + repository_data = data.get("repository") + if repository_data is None: + raise Exception( + "Repository {0} not found in GraphQL response".format( + repository["full_name"] + ) + ) + + discussions_enabled = repository_data.get("hasDiscussionsEnabled") + if not discussions_enabled: + return [], None, False, 0 + + discussions = repository_data.get("discussions") or {} + total_count = discussions.get("totalCount", total_count) + stop = False + + for discussion in _connection_nodes(discussions): + updated_at = discussion.get("updatedAt") + if updated_at and (newest_seen is None or updated_at > newest_seen): + newest_seen = updated_at + + if since and updated_at and updated_at <= since: + stop = True + break + + summaries.append(discussion) + + page_info = discussions.get("pageInfo") or {} + if stop or not page_info.get("hasNextPage"): + break + + after = page_info.get("endCursor") + page += 1 + + return summaries, newest_seen, discussions_enabled, total_count + + +def retrieve_discussion_comment_replies(args, comment_id, after=None, log_context=None): + data = retrieve_graphql_data( + args, + DISCUSSION_REPLIES_QUERY, + { + "commentId": comment_id, + "repliesCursor": after, + "pageSize": DISCUSSION_PAGE_SIZE, + }, + log_context=log_context, + ) + node = data.get("node") or {} + return node.get("replies") or {} + + +def _discussion_comment_log_identifier(comment_node): + return ( + comment_node.get("databaseId") + or comment_node.get("url") + or comment_node.get("id") + ) + + +def _discussion_comment_with_replies( + args, comment_node, repository_full_name=None, discussion_number=None +): + replies_connection = comment_node.get("replies") or {} + replies = _connection_nodes(replies_connection) + reply_total_count = replies_connection.get("totalCount", len(replies)) + page_info = replies_connection.get("pageInfo") or {} + reply_page = 2 + + while page_info.get("hasNextPage"): + log_context = None + if repository_full_name and discussion_number is not None: + log_context = "discussion {0}#{1} comment {2} replies page {3}".format( + repository_full_name, + discussion_number, + _discussion_comment_log_identifier(comment_node), + reply_page, + ) + + replies_connection = retrieve_discussion_comment_replies( + args, + comment_node["id"], + page_info.get("endCursor"), + log_context=log_context, + ) + replies.extend(_connection_nodes(replies_connection)) + page_info = replies_connection.get("pageInfo") or {} + reply_page += 1 + + comment = {key: value for key, value in comment_node.items() if key != "replies"} + comment["reply_count"] = reply_total_count + comment["reply_data"] = replies + return comment + + +def retrieve_discussion(args, repository, number): + owner, name = _repository_owner_name(repository) + comments_cursor = None + comments_page = 1 + discussion_data = None + comments = [] + comment_total_count = 0 + + while True: + data = retrieve_graphql_data( + args, + DISCUSSION_DETAIL_QUERY, + { + "owner": owner, + "name": name, + "number": number, + "commentsCursor": comments_cursor, + "pageSize": DISCUSSION_PAGE_SIZE, + }, + log_context="discussion {0}#{1} details/comments page {2}".format( + repository["full_name"], number, comments_page + ), + ) + repository_data = data.get("repository") or {} + discussion = repository_data.get("discussion") + if discussion is None: + raise Exception( + "Discussion #{0} not found in {1}".format( + number, repository["full_name"] + ) + ) + + if discussion_data is None: + discussion_data = { + key: value for key, value in discussion.items() if key != "comments" + } + + comments_connection = discussion.get("comments") or {} + comment_total_count = comments_connection.get( + "totalCount", comment_total_count + ) + for comment_node in _connection_nodes(comments_connection): + comments.append( + _discussion_comment_with_replies( + args, comment_node, repository["full_name"], number + ) + ) + + page_info = comments_connection.get("pageInfo") or {} + if not page_info.get("hasNextPage"): + break + + comments_cursor = page_info.get("endCursor") + comments_page += 1 + + discussion_data["comment_count"] = comment_total_count + discussion_data["comment_data"] = comments + return discussion_data + + +def backup_discussions(args, repo_cwd, repository): + discussion_cwd = os.path.join(repo_cwd, "discussions") + if args.skip_existing and os.path.isdir(discussion_cwd): + return + + if not get_graphql_auth(args): + logger.info( + "Skipping {0} discussions since GitHub GraphQL API requires authentication".format( + repository["full_name"] + ) + ) + return + + discussions_since = None + discussion_last_update_path = os.path.join(discussion_cwd, "last_update") + if args.incremental and os.path.exists(discussion_last_update_path): + with open(discussion_last_update_path) as f: + discussions_since = f.read().strip() + + logger.info("Retrieving {0} discussions".format(repository["full_name"])) + try: + ( + summaries, + newest_seen, + discussions_enabled, + total_count, + ) = retrieve_discussion_summaries(args, repository, since=discussions_since) + except Exception as e: + logger.warning( + "Unable to retrieve discussions for {0}, skipping: {1}".format( + repository["full_name"], e + ) + ) + return + + if not discussions_enabled: + logger.info( + "Discussions are not enabled for {0}, skipping".format( + repository["full_name"] + ) + ) + return + + mkdir_p(repo_cwd, discussion_cwd) + + if discussions_since: + logger.info( + "Saving {0} updated discussions to disk ({1} total)".format( + len(summaries), total_count + ) + ) + else: + logger.info("Saving {0} discussions to disk".format(len(summaries))) + + written_count = 0 + skipped_count = 0 + had_errors = False + for summary in summaries: + number = summary["number"] + discussion_file = os.path.join(discussion_cwd, "{0}.json".format(number)) + + if args.incremental_by_files and os.path.isfile(discussion_file): + modified = os.path.getmtime(discussion_file) + modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ") + if modified > summary["updatedAt"]: + logger.info( + "Skipping discussion {0} because it wasn't modified since last backup".format( + number + ) + ) + skipped_count += 1 + continue + + try: + discussion = retrieve_discussion(args, repository, number) + except Exception as e: logger.warning( - f"Repository {repository['full_name']} is unavailable (HTTP 451)" + "Unable to retrieve discussion {0}#{1}, skipping: {2}".format( + repository["full_name"], number, e + ) ) - if e.dmca_url: - logger.warning(f"DMCA notice: {e.dmca_url}") - logger.info(f"Skipping remaining resources for {repository['full_name']}") + had_errors = True continue - if args.incremental: - if last_update == "0000-00-00T00:00:00Z": - last_update = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime()) + if args.include_attachments: + download_attachments( + args, + discussion_cwd, + discussion, + number, + repository, + item_type="discussion", + ) - open(last_update_path, "w").write(last_update) + if json_dump_if_changed(discussion, discussion_file): + written_count += 1 + + if ( + args.incremental + and not had_errors + and newest_seen + and (not discussions_since or newest_seen > discussions_since) + ): + with open(discussion_last_update_path, "w") as f: + f.write(newest_seen) + + attempted_count = len(summaries) - skipped_count + if not summaries: + logger.info("No discussions to save") + elif attempted_count == 0: + logger.info("{0} discussions skipped".format(skipped_count)) + elif written_count == attempted_count: + logger.info("Saved {0} discussions to disk".format(written_count)) + elif written_count == 0: + logger.info( + "{0} discussions unchanged, skipped write".format(attempted_count) + ) + else: + logger.info( + "Saved {0} discussions to disk ({1} unchanged, {2} skipped)".format( + written_count, + attempted_count - written_count, + skipped_count, + ) + ) def backup_issues(args, repo_cwd, repository, repos_template): @@ -1794,6 +2562,58 @@ def backup_issues(args, repo_cwd, repository, repos_template): os.replace(issue_file + ".temp", issue_file) # Atomic write +PULL_OPTIONAL_DATA_KEYS = ( + "comment_regular_data", + "comment_data", + "commit_data", + "review_data", +) +PULL_REVIEWS_LAST_UPDATE_FILENAME = "reviews_last_update" + + +def read_json_file_if_exists(path): + if not os.path.isfile(path): + return None + + try: + with codecs.open(path, "r", encoding="utf-8") as f: + return json.load(f) + except (OSError, UnicodeDecodeError, json.decoder.JSONDecodeError) as e: + logger.debug("Error reading existing JSON file {0}: {1}".format(path, e)) + return None + + +def restore_existing_pull_optional_data(pull, existing_pull): + if not existing_pull: + return + + for key in PULL_OPTIONAL_DATA_KEYS: + if key not in pull and key in existing_pull: + pull[key] = existing_pull[key] + + +def get_pull_reviews_since(args, pulls_cwd): + args_since = getattr(args, "since", None) + if not args.incremental: + return args_since, None, None + + reviews_last_update_path = os.path.join( + pulls_cwd, PULL_REVIEWS_LAST_UPDATE_FILENAME + ) + if not os.path.exists(reviews_last_update_path): + # One-time backfill for existing incremental backups: if the user adds + # --pull-reviews after a repository checkpoint already exists, the + # repository-level checkpoint would otherwise skip old PRs forever. + return None, None, reviews_last_update_path + + with open(reviews_last_update_path) as f: + reviews_since = f.read().strip() + if args_since and reviews_since: + return min(args_since, reviews_since), reviews_since, reviews_last_update_path + + return args_since or reviews_since, reviews_since, reviews_last_update_path + + def backup_pulls(args, repo_cwd, repository, repos_template): has_pulls_dir = os.path.isdir("{0}/pulls/.git".format(repo_cwd)) if args.skip_existing and has_pulls_dir: @@ -1803,7 +2623,20 @@ def backup_pulls(args, repo_cwd, repository, repos_template): pulls_cwd = os.path.join(repo_cwd, "pulls") mkdir_p(repo_cwd, pulls_cwd) + include_pull_reviews = args.include_pull_reviews or args.include_everything + repository_since = getattr(args, "since", None) + pulls_since = repository_since + pull_reviews_since = None + pull_reviews_last_update_path = None + if include_pull_reviews: + ( + pulls_since, + pull_reviews_since, + pull_reviews_last_update_path, + ) = get_pull_reviews_since(args, pulls_cwd) + pulls = {} + newest_pull_update = None _pulls_template = "{0}/{1}/pulls".format(repos_template, repository["full_name"]) _issue_template = "{0}/{1}/issues".format(repos_template, repository["full_name"]) query_args = { @@ -1813,27 +2646,45 @@ def backup_pulls(args, repo_cwd, repository, repos_template): "direction": "desc", } + def track_newest_pull_update(pull): + nonlocal newest_pull_update + updated_at = pull.get("updated_at") + if updated_at and ( + newest_pull_update is None or updated_at > newest_pull_update + ): + newest_pull_update = updated_at + + def pull_is_due_for_repository_checkpoint(pull): + return not repository_since or pull["updated_at"] > repository_since + if not args.include_pull_details: pull_states = ["open", "closed"] for pull_state in pull_states: query_args["state"] = pull_state - _pulls = retrieve_data(args, _pulls_template, query_args=query_args) - for pull in _pulls: - if args.since and pull["updated_at"] < args.since: + for pull in retrieve_data( + args, _pulls_template, query_args=query_args, lazy=True + ): + track_newest_pull_update(pull) + if pulls_since and pull["updated_at"] <= pulls_since: break - if not args.since or pull["updated_at"] >= args.since: + if not pulls_since or pull["updated_at"] > pulls_since: pulls[pull["number"]] = pull else: - _pulls = retrieve_data(args, _pulls_template, query_args=query_args) - for pull in _pulls: - if args.since and pull["updated_at"] < args.since: + for pull in retrieve_data( + args, _pulls_template, query_args=query_args, lazy=True + ): + track_newest_pull_update(pull) + if pulls_since and pull["updated_at"] <= pulls_since: break - if not args.since or pull["updated_at"] >= args.since: - pulls[pull["number"]] = retrieve_data( - args, - _pulls_template + "/{}".format(pull["number"]), - paginated=False, - )[0] + if not pulls_since or pull["updated_at"] > pulls_since: + if pull_is_due_for_repository_checkpoint(pull): + pulls[pull["number"]] = retrieve_data( + args, + _pulls_template + "/{}".format(pull["number"]), + paginated=False, + )[0] + else: + pulls[pull["number"]] = pull logger.info("Saving {0} pull requests to disk".format(len(list(pulls.keys())))) # Comments from pulls API are only _review_ comments @@ -1843,24 +2694,50 @@ def backup_pulls(args, repo_cwd, repository, repos_template): comments_regular_template = _issue_template + "/{0}/comments" comments_template = _pulls_template + "/{0}/comments" commits_template = _pulls_template + "/{0}/commits" + reviews_template = _pulls_template + "/{0}/reviews" + pull_review_errors = False + for number, pull in list(pulls.items()): pull_file = "{0}/{1}.json".format(pulls_cwd, number) + existing_pull = read_json_file_if_exists(pull_file) + needs_review_backfill = ( + include_pull_reviews + and (not existing_pull or "review_data" not in existing_pull) + ) + if args.incremental_by_files and os.path.isfile(pull_file): modified = os.path.getmtime(pull_file) modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ") - if modified > pull["updated_at"]: + if modified > pull["updated_at"] and not needs_review_backfill: logger.info( "Skipping pull request {0} because it wasn't modified since last backup".format( number ) ) continue - if args.include_pull_comments or args.include_everything: + + should_fetch_non_review_data = pull_is_due_for_repository_checkpoint(pull) + if ( + args.include_pull_comments or args.include_everything + ) and should_fetch_non_review_data: template = comments_regular_template.format(number) pulls[number]["comment_regular_data"] = retrieve_data(args, template) template = comments_template.format(number) pulls[number]["comment_data"] = retrieve_data(args, template) - if args.include_pull_commits or args.include_everything: + if include_pull_reviews: + template = reviews_template.format(number) + try: + pulls[number]["review_data"] = retrieve_data(args, template) + except Exception as e: + pull_review_errors = True + logger.warning( + "Unable to retrieve reviews for pull request {0}#{1}, skipping reviews: {2}".format( + repository["full_name"], number, e + ) + ) + if ( + args.include_pull_commits or args.include_everything + ) and should_fetch_non_review_data: template = commits_template.format(number) pulls[number]["commit_data"] = retrieve_data(args, template) if args.include_attachments: @@ -1868,10 +2745,23 @@ def backup_pulls(args, repo_cwd, repository, repos_template): args, pulls_cwd, pulls[number], number, repository, item_type="pull" ) + restore_existing_pull_optional_data(pull, existing_pull) + with codecs.open(pull_file + ".temp", "w", encoding="utf-8") as f: json_dump(pull, f) os.replace(pull_file + ".temp", pull_file) # Atomic write + if ( + include_pull_reviews + and args.incremental + and pull_reviews_last_update_path + and newest_pull_update + and not pull_review_errors + and (not pull_reviews_since or newest_pull_update > pull_reviews_since) + ): + with open(pull_reviews_last_update_path, "w") as f: + f.write(newest_pull_update) + def backup_milestones(args, repo_cwd, repository, repos_template): milestone_cwd = os.path.join(repo_cwd, "milestones") @@ -1910,6 +2800,50 @@ def backup_milestones(args, repo_cwd, repository, repos_template): ) +def backup_security_advisories(args, repo_cwd, repository, repos_template): + advisory_cwd = os.path.join(repo_cwd, "security-advisories") + if args.skip_existing and os.path.isdir(advisory_cwd): + return + + logger.info("Retrieving {0} security advisories".format(repository["full_name"])) + + template = "{0}/{1}/security-advisories".format( + repos_template, repository["full_name"] + ) + + try: + _advisories = retrieve_data(args, template) + except Exception as e: + if "404" in str(e): + logger.info("Security advisories are not available for this repository, skipping") + return + raise + + mkdir_p(repo_cwd, advisory_cwd) + + advisories = {} + for advisory in _advisories: + advisories[advisory["ghsa_id"]] = advisory + + written_count = 0 + for ghsa_id, advisory in list(advisories.items()): + advisory_file = "{0}/{1}.json".format(advisory_cwd, ghsa_id) + if json_dump_if_changed(advisory, advisory_file): + written_count += 1 + + total = len(advisories) + if written_count == total: + logger.info("Saved {0} security advisories to disk".format(total)) + elif written_count == 0: + logger.info("{0} security advisories unchanged, skipped write".format(total)) + else: + logger.info( + "Saved {0} of {1} security advisories to disk ({2} unchanged)".format( + written_count, total, total - written_count + ) + ) + + def backup_labels(args, repo_cwd, repository, repos_template): label_cwd = os.path.join(repo_cwd, "labels") output_file = "{0}/labels.json".format(label_cwd) @@ -1987,7 +2921,12 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F written_count += 1 if include_assets and not skip_assets: - assets = retrieve_data(args, release["assets_url"]) + # The releases list API already includes release asset metadata. Use + # it to avoid an extra /releases/{id}/assets request per release. + # Keep a fallback for older/enterprise responses that might omit it. + assets = release.get("assets") + if assets is None: + assets = retrieve_data(args, release["assets_url"]) if len(assets) > 0: # give release asset files somewhere to live & download them (not including source archives) release_assets_cwd = os.path.join(release_cwd, release_name_safe) @@ -2043,7 +2982,8 @@ def fetch_repository( masked_remote_url = mask_password(remote_url) initialized = subprocess.call( - "git ls-remote " + remote_url, stdout=FNULL, stderr=FNULL, shell=True + ["git", "ls-remote", remote_url], stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) if initialized == 128: if ".wiki.git" in remote_url: diff --git a/github_backup/graphql_queries.py b/github_backup/graphql_queries.py new file mode 100644 index 0000000..96eb552 --- /dev/null +++ b/github_backup/graphql_queries.py @@ -0,0 +1,292 @@ +"""GraphQL query templates used by github-backup.""" + +DISCUSSION_PAGE_SIZE = 100 + +DISCUSSION_LIST_QUERY = """ +query($owner: String!, $name: String!, $after: String, $pageSize: Int!) { + repository(owner: $owner, name: $name) { + hasDiscussionsEnabled + discussions( + first: $pageSize, + after: $after, + orderBy: {field: UPDATED_AT, direction: DESC} + ) { + totalCount + nodes { + id + number + title + updatedAt + } + pageInfo { + hasNextPage + endCursor + } + } + } +} +""" + +DISCUSSION_DETAIL_QUERY = """ +query( + $owner: String!, + $name: String!, + $number: Int!, + $commentsCursor: String, + $pageSize: Int! +) { + repository(owner: $owner, name: $name) { + discussion(number: $number) { + activeLockReason + answer { + id + databaseId + url + } + answerChosenAt + answerChosenBy { + ...ActorFields + } + author { + ...ActorFields + } + authorAssociation + body + bodyHTML + bodyText + category { + createdAt + description + emoji + emojiHTML + id + isAnswerable + name + slug + updatedAt + } + closed + closedAt + createdAt + createdViaEmail + databaseId + editor { + ...ActorFields + } + id + includesCreatedEdit + isAnswered + labels(first: 100) { + totalCount + nodes { + id + name + color + description + } + } + lastEditedAt + locked + number + poll { + id + question + totalVoteCount + options(first: 100) { + totalCount + nodes { + id + option + totalVoteCount + } + } + } + publishedAt + reactionGroups { + ...ReactionGroupFields + } + resourcePath + stateReason + title + updatedAt + upvoteCount + url + comments(first: $pageSize, after: $commentsCursor) { + totalCount + nodes { + ...DiscussionCommentFields + replies(first: $pageSize) { + totalCount + nodes { + ...DiscussionReplyFields + } + pageInfo { + hasNextPage + endCursor + } + } + } + pageInfo { + hasNextPage + endCursor + } + } + } + } +} + +fragment ActorFields on Actor { + avatarUrl + login + resourcePath + url +} + +fragment ReactionGroupFields on ReactionGroup { + content + reactors { + totalCount + } +} + +fragment DiscussionCommentFields on DiscussionComment { + author { + ...ActorFields + } + authorAssociation + body + bodyHTML + bodyText + createdAt + createdViaEmail + databaseId + deletedAt + editor { + ...ActorFields + } + id + includesCreatedEdit + isAnswer + isMinimized + lastEditedAt + minimizedReason + publishedAt + reactionGroups { + ...ReactionGroupFields + } + replyTo { + id + databaseId + url + } + resourcePath + updatedAt + upvoteCount + url +} + +fragment DiscussionReplyFields on DiscussionComment { + author { + ...ActorFields + } + authorAssociation + body + bodyHTML + bodyText + createdAt + createdViaEmail + databaseId + deletedAt + editor { + ...ActorFields + } + id + includesCreatedEdit + isAnswer + isMinimized + lastEditedAt + minimizedReason + publishedAt + reactionGroups { + ...ReactionGroupFields + } + replyTo { + id + databaseId + url + } + resourcePath + updatedAt + upvoteCount + url +} +""" + +DISCUSSION_REPLIES_QUERY = """ +query($commentId: ID!, $repliesCursor: String, $pageSize: Int!) { + node(id: $commentId) { + ... on DiscussionComment { + replies(first: $pageSize, after: $repliesCursor) { + totalCount + nodes { + ...DiscussionReplyFields + } + pageInfo { + hasNextPage + endCursor + } + } + } + } +} + +fragment ActorFields on Actor { + avatarUrl + login + resourcePath + url +} + +fragment ReactionGroupFields on ReactionGroup { + content + reactors { + totalCount + } +} + +fragment DiscussionReplyFields on DiscussionComment { + author { + ...ActorFields + } + authorAssociation + body + bodyHTML + bodyText + createdAt + createdViaEmail + databaseId + deletedAt + editor { + ...ActorFields + } + id + includesCreatedEdit + isAnswer + isMinimized + lastEditedAt + minimizedReason + publishedAt + reactionGroups { + ...ReactionGroupFields + } + replyTo { + id + databaseId + url + } + resourcePath + updatedAt + upvoteCount + url +} +""" diff --git a/release-requirements.txt b/release-requirements.txt index dd2d73f..117aeea 100644 --- a/release-requirements.txt +++ b/release-requirements.txt @@ -1,15 +1,15 @@ # Linting & Formatting autopep8==2.3.2 -black==25.12.0 +black==26.5.1 flake8==7.3.0 # Testing -pytest==9.0.2 +pytest==9.0.3 # Release & Publishing twine==6.2.0 gitchangelog==3.0.4 -setuptools==80.9.0 +setuptools==82.0.1 # Documentation restructuredtext-lint==2.0.2 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..b36fe64 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,25 @@ +"""Shared pytest fixtures for github-backup tests.""" + +import pytest + +from github_backup.github_backup import parse_args + + +@pytest.fixture +def create_args(): + """Factory fixture that creates args with real CLI defaults. + + Uses the actual argument parser so new CLI args are automatically + available with their defaults - no test updates needed. + + Usage: + def test_something(self, create_args): + args = create_args(include_releases=True, user="myuser") + """ + def _create(**overrides): + # Use real parser to get actual defaults + args = parse_args(["testuser"]) + for key, value in overrides.items(): + setattr(args, key, value) + return args + return _create diff --git a/tests/test_all_starred.py b/tests/test_all_starred.py index 0fab048..9776926 100644 --- a/tests/test_all_starred.py +++ b/tests/test_all_starred.py @@ -1,7 +1,7 @@ """Tests for --all-starred flag behavior (issue #225).""" import pytest -from unittest.mock import Mock, patch +from unittest.mock import patch from github_backup import github_backup @@ -12,57 +12,14 @@ class TestAllStarredCloning: Issue #225: --all-starred should clone starred repos without requiring --repositories. """ - def _create_mock_args(self, **overrides): - """Create a mock args object with sensible defaults.""" - args = Mock() - args.user = "testuser" - args.output_directory = "/tmp/backup" - args.include_repository = False - args.include_everything = False - args.include_gists = False - args.include_starred_gists = False - args.all_starred = False - args.skip_existing = False - args.bare_clone = False - args.lfs_clone = False - args.no_prune = False - args.include_wiki = False - args.include_issues = False - args.include_issue_comments = False - args.include_issue_events = False - args.include_pulls = False - args.include_pull_comments = False - args.include_pull_commits = False - args.include_pull_details = False - args.include_labels = False - args.include_hooks = False - args.include_milestones = False - args.include_releases = False - args.include_assets = False - args.include_attachments = False - args.incremental = False - args.incremental_by_files = False - args.github_host = None - args.prefer_ssh = False - args.token_classic = None - args.token_fine = None - args.as_app = False - args.osx_keychain_item_name = None - args.osx_keychain_item_account = None - - for key, value in overrides.items(): - setattr(args, key, value) - - return args - @patch('github_backup.github_backup.fetch_repository') @patch('github_backup.github_backup.get_github_repo_url') - def test_all_starred_clones_without_repositories_flag(self, mock_get_url, mock_fetch): + def test_all_starred_clones_without_repositories_flag(self, mock_get_url, mock_fetch, create_args): """--all-starred should clone starred repos without --repositories flag. This is the core fix for issue #225. """ - args = self._create_mock_args(all_starred=True) + args = create_args(all_starred=True) mock_get_url.return_value = "https://github.com/otheruser/awesome-project.git" # A starred repository (is_starred flag set by retrieve_repositories) @@ -87,9 +44,9 @@ def test_all_starred_clones_without_repositories_flag(self, mock_get_url, mock_f @patch('github_backup.github_backup.fetch_repository') @patch('github_backup.github_backup.get_github_repo_url') - def test_starred_repo_not_cloned_without_all_starred_flag(self, mock_get_url, mock_fetch): + def test_starred_repo_not_cloned_without_all_starred_flag(self, mock_get_url, mock_fetch, create_args): """Starred repos should NOT be cloned if --all-starred is not set.""" - args = self._create_mock_args(all_starred=False) + args = create_args(all_starred=False) mock_get_url.return_value = "https://github.com/otheruser/awesome-project.git" starred_repo = { @@ -110,9 +67,9 @@ def test_starred_repo_not_cloned_without_all_starred_flag(self, mock_get_url, mo @patch('github_backup.github_backup.fetch_repository') @patch('github_backup.github_backup.get_github_repo_url') - def test_non_starred_repo_not_cloned_with_only_all_starred(self, mock_get_url, mock_fetch): + def test_non_starred_repo_not_cloned_with_only_all_starred(self, mock_get_url, mock_fetch, create_args): """Non-starred repos should NOT be cloned when only --all-starred is set.""" - args = self._create_mock_args(all_starred=True) + args = create_args(all_starred=True) mock_get_url.return_value = "https://github.com/testuser/my-project.git" # A regular (non-starred) repository @@ -134,9 +91,9 @@ def test_non_starred_repo_not_cloned_with_only_all_starred(self, mock_get_url, m @patch('github_backup.github_backup.fetch_repository') @patch('github_backup.github_backup.get_github_repo_url') - def test_repositories_flag_still_works(self, mock_get_url, mock_fetch): + def test_repositories_flag_still_works(self, mock_get_url, mock_fetch, create_args): """--repositories flag should still clone repos as before.""" - args = self._create_mock_args(include_repository=True) + args = create_args(include_repository=True) mock_get_url.return_value = "https://github.com/testuser/my-project.git" regular_repo = { diff --git a/tests/test_attachments.py b/tests/test_attachments.py index b338caf..241a08f 100644 --- a/tests/test_attachments.py +++ b/tests/test_attachments.py @@ -4,7 +4,7 @@ import os import tempfile from pathlib import Path -from unittest.mock import Mock +from unittest.mock import Mock, patch import pytest @@ -12,22 +12,13 @@ @pytest.fixture -def attachment_test_setup(tmp_path): +def attachment_test_setup(tmp_path, create_args): """Fixture providing setup and helper for attachment download tests.""" - from unittest.mock import patch - issue_cwd = tmp_path / "issues" issue_cwd.mkdir() - # Mock args - args = Mock() - args.as_app = False - args.token_fine = None - args.token_classic = None - args.osx_keychain_item_name = None - args.osx_keychain_item_account = None - args.user = "testuser" - args.repository = "testrepo" + # Create args using shared fixture + args = create_args(user="testuser", repository="testrepo") repository = {"full_name": "testuser/testrepo"} @@ -349,3 +340,146 @@ def test_manifest_skips_permanent_failures(self, attachment_test_setup): downloaded_urls[0] == "https://github.com/user-attachments/assets/unavailable" ) + + +class TestJWTWorkaround: + """Test JWT workaround for fine-grained tokens on private repos (issue #477).""" + + def test_markdown_api_extracts_jwt_url(self): + """Markdown API response with JWT URL is extracted correctly.""" + html_response = ( + '

' + ) + + mock_response = Mock() + mock_response.read.return_value = html_response.encode("utf-8") + + with patch("github_backup.github_backup.urlopen", return_value=mock_response): + result = github_backup.get_jwt_signed_url_via_markdown_api( + "https://github.com/user-attachments/assets/abc123", + "github_pat_token", + "owner/repo" + ) + + expected = ( + "https://private-user-images.githubusercontent.com" + "/123/abc.png?jwt=eyJhbGciOiJ" + ) + assert result == expected + + def test_markdown_api_returns_none_on_http_error(self): + """HTTP errors return None.""" + from urllib.error import HTTPError + + error = HTTPError("http://test", 403, "Forbidden", {}, None) + with patch("github_backup.github_backup.urlopen", side_effect=error): + result = github_backup.get_jwt_signed_url_via_markdown_api( + "https://github.com/user-attachments/assets/abc123", + "github_pat_token", + "owner/repo" + ) + + assert result is None + + def test_markdown_api_returns_none_when_no_jwt_url(self): + """Response without JWT URL returns None.""" + mock_response = Mock() + mock_response.read.return_value = b"

No image here

" + + with patch("github_backup.github_backup.urlopen", return_value=mock_response): + result = github_backup.get_jwt_signed_url_via_markdown_api( + "https://github.com/user-attachments/assets/abc123", + "github_pat_token", + "owner/repo" + ) + + assert result is None + + def test_needs_jwt_only_for_fine_grained_private_assets(self): + """needs_jwt is True only for fine-grained + private + /assets/ URL.""" + assets_url = "https://github.com/user-attachments/assets/abc123" + files_url = "https://github.com/user-attachments/files/123/doc.pdf" + token_fine = "github_pat_test" + private = True + public = False + + # Fine-grained + private + assets = True + needs_jwt = ( + token_fine is not None + and private + and "github.com/user-attachments/assets/" in assets_url + ) + assert needs_jwt is True + + # Fine-grained + private + files = False + needs_jwt = ( + token_fine is not None + and private + and "github.com/user-attachments/assets/" in files_url + ) + assert needs_jwt is False + + # Fine-grained + public + assets = False + needs_jwt = ( + token_fine is not None + and public + and "github.com/user-attachments/assets/" in assets_url + ) + assert needs_jwt is False + + def test_jwt_workaround_sets_manifest_flag(self, attachment_test_setup): + """Successful JWT workaround sets jwt_workaround flag in manifest.""" + setup = attachment_test_setup + setup["args"].token_fine = "github_pat_test" + setup["repository"]["private"] = True + + issue_data = {"body": "https://github.com/user-attachments/assets/abc123"} + + jwt_url = "https://private-user-images.githubusercontent.com/123/abc.png?jwt=token" + + with patch( + "github_backup.github_backup.get_jwt_signed_url_via_markdown_api", + return_value=jwt_url + ), patch( + "github_backup.github_backup.download_attachment_file", + return_value={"success": True, "http_status": 200, "url": jwt_url} + ): + github_backup.download_attachments( + setup["args"], setup["issue_cwd"], issue_data, 123, setup["repository"] + ) + + manifest_path = os.path.join(setup["issue_cwd"], "attachments", "123", "manifest.json") + with open(manifest_path) as f: + manifest = json.load(f) + + assert manifest["attachments"][0]["jwt_workaround"] is True + assert manifest["attachments"][0]["url"] == "https://github.com/user-attachments/assets/abc123" + + def test_jwt_workaround_failure_uses_skipped_at(self, attachment_test_setup): + """Failed JWT workaround uses skipped_at instead of downloaded_at.""" + setup = attachment_test_setup + setup["args"].token_fine = "github_pat_test" + setup["repository"]["private"] = True + + issue_data = {"body": "https://github.com/user-attachments/assets/abc123"} + + with patch( + "github_backup.github_backup.get_jwt_signed_url_via_markdown_api", + return_value=None # Markdown API failed + ): + github_backup.download_attachments( + setup["args"], setup["issue_cwd"], issue_data, 123, setup["repository"] + ) + + manifest_path = os.path.join(setup["issue_cwd"], "attachments", "123", "manifest.json") + with open(manifest_path) as f: + manifest = json.load(f) + + attachment = manifest["attachments"][0] + assert attachment["success"] is False + assert "skipped_at" in attachment + assert "downloaded_at" not in attachment + assert "Use --token-classic" in attachment["error"] diff --git a/tests/test_auth.py b/tests/test_auth.py new file mode 100644 index 0000000..0102878 --- /dev/null +++ b/tests/test_auth.py @@ -0,0 +1,75 @@ +"""Tests for authentication helpers.""" + +from unittest.mock import patch + +import pytest + +from github_backup import github_backup + + +def test_token_from_gh_flag_parses(): + args = github_backup.parse_args(["--token-from-gh", "testuser"]) + assert args.token_from_gh is True + + +def test_get_auth_reads_token_from_gh_cli(create_args): + args = create_args(token_from_gh=True) + + with patch( + "github_backup.github_backup.subprocess.check_output", + return_value=b"gho_test_token\n", + ) as mock_check_output: + auth = github_backup.get_auth(args, encode=False) + + assert auth == "gho_test_token:x-oauth-basic" + mock_check_output.assert_called_once_with( + ["gh", "auth", "token"], stderr=github_backup.subprocess.PIPE + ) + + +def test_get_auth_reads_token_from_gh_cli_for_enterprise_host(create_args): + args = create_args(token_from_gh=True, github_host="ghe.example.com") + + with patch( + "github_backup.github_backup.subprocess.check_output", + return_value=b"gho_enterprise_token\n", + ) as mock_check_output: + auth = github_backup.get_auth(args, encode=False) + + assert auth == "gho_enterprise_token:x-oauth-basic" + mock_check_output.assert_called_once_with( + ["gh", "auth", "token", "--hostname", "ghe.example.com"], + stderr=github_backup.subprocess.PIPE, + ) + + +def test_token_from_gh_is_cached(create_args): + args = create_args(token_from_gh=True) + + with patch( + "github_backup.github_backup.subprocess.check_output", + return_value=b"gho_cached_token\n", + ) as mock_check_output: + assert github_backup.get_auth(args, encode=False) == "gho_cached_token:x-oauth-basic" + assert github_backup.get_auth(args, encode=False) == "gho_cached_token:x-oauth-basic" + + mock_check_output.assert_called_once() + + +def test_graphql_auth_strips_basic_auth_suffix_for_gh_cli_token(create_args): + args = create_args(token_from_gh=True) + + with patch( + "github_backup.github_backup.subprocess.check_output", + return_value=b"gho_graphql_token\n", + ): + assert github_backup.get_graphql_auth(args) == "gho_graphql_token" + + +def test_token_from_gh_rejects_as_app(create_args): + args = create_args(token_from_gh=True, as_app=True) + + with pytest.raises(Exception) as exc_info: + github_backup.get_auth(args, encode=False) + + assert "--token-from-gh cannot be used with --as-app" in str(exc_info.value) diff --git a/tests/test_case_sensitivity.py b/tests/test_case_sensitivity.py index 058a7df..795c14b 100644 --- a/tests/test_case_sensitivity.py +++ b/tests/test_case_sensitivity.py @@ -1,7 +1,6 @@ """Tests for case-insensitive username/organization filtering.""" import pytest -from unittest.mock import Mock from github_backup import github_backup @@ -9,25 +8,14 @@ class TestCaseSensitivity: """Test suite for case-insensitive username matching in filter_repositories.""" - def test_filter_repositories_case_insensitive_user(self): + def test_filter_repositories_case_insensitive_user(self, create_args): """Should filter repositories case-insensitively for usernames. Reproduces issue #198 where typing 'iamrodos' fails to match repositories with owner.login='Iamrodos' (the canonical case from GitHub API). """ # Simulate user typing lowercase username - args = Mock() - args.user = "iamrodos" # lowercase (what user typed) - args.repository = None - args.name_regex = None - args.languages = None - args.exclude = None - args.fork = False - args.private = False - args.public = False - args.all = True - args.skip_archived = False - args.starred_skip_size_over = None + args = create_args(user="iamrodos") # Simulate GitHub API returning canonical case repos = [ @@ -52,23 +40,12 @@ def test_filter_repositories_case_insensitive_user(self): assert filtered[0]["name"] == "repo1" assert filtered[1]["name"] == "repo2" - def test_filter_repositories_case_insensitive_org(self): + def test_filter_repositories_case_insensitive_org(self, create_args): """Should filter repositories case-insensitively for organizations. Tests the example from issue #198 where 'prai-org' doesn't match 'PRAI-Org'. """ - args = Mock() - args.user = "prai-org" # lowercase (what user typed) - args.repository = None - args.name_regex = None - args.languages = None - args.exclude = None - args.fork = False - args.private = False - args.public = False - args.all = True - args.skip_archived = False - args.starred_skip_size_over = None + args = create_args(user="prai-org") repos = [ { @@ -85,20 +62,9 @@ def test_filter_repositories_case_insensitive_org(self): assert len(filtered) == 1 assert filtered[0]["name"] == "repo1" - def test_filter_repositories_case_variations(self): + def test_filter_repositories_case_variations(self, create_args): """Should handle various case combinations correctly.""" - args = Mock() - args.user = "TeSt-UsEr" # Mixed case - args.repository = None - args.name_regex = None - args.languages = None - args.exclude = None - args.fork = False - args.private = False - args.public = False - args.all = True - args.skip_archived = False - args.starred_skip_size_over = None + args = create_args(user="TeSt-UsEr") repos = [ {"name": "repo1", "owner": {"login": "test-user"}, "private": False, "fork": False}, diff --git a/tests/test_discussions.py b/tests/test_discussions.py new file mode 100644 index 0000000..2b5e3fb --- /dev/null +++ b/tests/test_discussions.py @@ -0,0 +1,257 @@ +"""Tests for GitHub Discussions backup support.""" + +import json +import os +from unittest.mock import patch + +from github_backup import github_backup + + +def test_parse_args_discussions_flag(): + args = github_backup.parse_args(["--discussions", "testuser"]) + assert args.include_discussions is True + + +def test_retrieve_discussion_summaries_stops_at_incremental_since(create_args): + args = create_args() + repository = {"full_name": "owner/repo"} + + page = { + "repository": { + "hasDiscussionsEnabled": True, + "discussions": { + "totalCount": 3, + "nodes": [ + {"number": 3, "title": "new", "updatedAt": "2026-02-01T00:00:00Z"}, + {"number": 2, "title": "also new", "updatedAt": "2026-01-10T00:00:00Z"}, + {"number": 1, "title": "old", "updatedAt": "2025-12-01T00:00:00Z"}, + ], + "pageInfo": {"hasNextPage": True, "endCursor": "NEXT"}, + }, + } + } + + with patch( + "github_backup.github_backup.retrieve_graphql_data", return_value=page + ) as mock_retrieve: + summaries, newest, enabled, total = github_backup.retrieve_discussion_summaries( + args, repository, since="2026-01-01T00:00:00Z" + ) + + assert enabled is True + assert total == 3 + assert newest == "2026-02-01T00:00:00Z" + assert [item["number"] for item in summaries] == [3, 2] + # The old discussion stops pagination, so the next page is not requested. + assert mock_retrieve.call_count == 1 + assert ( + mock_retrieve.call_args.kwargs["log_context"] + == "discussion summaries owner/repo page 1" + ) + + +def test_retrieve_discussion_summaries_excludes_checkpoint_timestamp(create_args): + args = create_args() + repository = {"full_name": "owner/repo"} + + page = { + "repository": { + "hasDiscussionsEnabled": True, + "discussions": { + "totalCount": 1, + "nodes": [ + { + "number": 1, + "title": "already backed up", + "updatedAt": "2026-01-01T00:00:00Z", + }, + ], + "pageInfo": {"hasNextPage": True, "endCursor": "NEXT"}, + }, + } + } + + with patch( + "github_backup.github_backup.retrieve_graphql_data", return_value=page + ) as mock_retrieve: + summaries, newest, enabled, total = github_backup.retrieve_discussion_summaries( + args, repository, since="2026-01-01T00:00:00Z" + ) + + assert enabled is True + assert total == 1 + assert newest == "2026-01-01T00:00:00Z" + assert summaries == [] + assert mock_retrieve.call_count == 1 + + +def test_retrieve_discussion_summaries_disabled_discussions(create_args): + args = create_args() + repository = {"full_name": "owner/repo"} + + with patch( + "github_backup.github_backup.retrieve_graphql_data", + return_value={"repository": {"hasDiscussionsEnabled": False}}, + ): + summaries, newest, enabled, total = github_backup.retrieve_discussion_summaries( + args, repository + ) + + assert summaries == [] + assert newest is None + assert enabled is False + assert total == 0 + + +def _comment(comment_id, body, replies=None, replies_has_next=False): + replies = replies or [] + return { + "id": comment_id, + "body": body, + "replies": { + "totalCount": len(replies) + (1 if replies_has_next else 0), + "nodes": replies, + "pageInfo": { + "hasNextPage": replies_has_next, + "endCursor": "REPLIES2" if replies_has_next else None, + }, + }, + } + + +def _discussion_page(comment_nodes, has_next=False): + return { + "repository": { + "discussion": { + "number": 42, + "title": "Discussion title", + "updatedAt": "2026-02-01T00:00:00Z", + "comments": { + "totalCount": 2, + "nodes": comment_nodes, + "pageInfo": { + "hasNextPage": has_next, + "endCursor": "COMMENTS2" if has_next else None, + }, + }, + } + } + } + + +def test_retrieve_discussion_paginates_comments_and_replies(create_args): + args = create_args() + repository = {"full_name": "owner/repo"} + + reply_1 = {"id": "reply-1", "body": "first reply"} + reply_2 = {"id": "reply-2", "body": "second reply"} + comment_1 = _comment("comment-1", "first comment", [reply_1], replies_has_next=True) + comment_2 = _comment("comment-2", "second comment") + + responses = [ + _discussion_page([comment_1], has_next=True), + { + "node": { + "replies": { + "totalCount": 2, + "nodes": [reply_2], + "pageInfo": {"hasNextPage": False, "endCursor": None}, + } + } + }, + _discussion_page([comment_2], has_next=False), + ] + + with patch( + "github_backup.github_backup.retrieve_graphql_data", side_effect=responses + ) as mock_retrieve: + discussion = github_backup.retrieve_discussion(args, repository, 42) + + assert discussion["number"] == 42 + assert discussion["comment_count"] == 2 + assert len(discussion["comment_data"]) == 2 + assert discussion["comment_data"][0]["body"] == "first comment" + assert discussion["comment_data"][0]["reply_count"] == 2 + assert [r["body"] for r in discussion["comment_data"][0]["reply_data"]] == [ + "first reply", + "second reply", + ] + assert discussion["comment_data"][1]["body"] == "second comment" + assert mock_retrieve.call_count == 3 + assert [ + call.kwargs["log_context"] for call in mock_retrieve.call_args_list + ] == [ + "discussion owner/repo#42 details/comments page 1", + "discussion owner/repo#42 comment comment-1 replies page 2", + "discussion owner/repo#42 details/comments page 2", + ] + + +def test_backup_discussions_uses_incremental_checkpoint(create_args, tmp_path): + args = create_args(token_classic="fake_token", include_discussions=True, incremental=True) + repository = {"full_name": "owner/repo"} + discussions_dir = tmp_path / "discussions" + discussions_dir.mkdir() + (discussions_dir / "last_update").write_text("2026-01-01T00:00:00Z") + + def fake_summaries(passed_args, passed_repository, since=None): + assert passed_args is args + assert passed_repository == repository + assert since == "2026-01-01T00:00:00Z" + return ( + [{"number": 7, "title": "updated", "updatedAt": "2026-02-01T00:00:00Z"}], + "2026-02-01T00:00:00Z", + True, + 1, + ) + + with patch( + "github_backup.github_backup.retrieve_discussion_summaries", + side_effect=fake_summaries, + ), patch( + "github_backup.github_backup.retrieve_discussion", + return_value={"number": 7, "title": "updated"}, + ): + github_backup.backup_discussions(args, tmp_path, repository) + + with open(discussions_dir / "7.json", encoding="utf-8") as f: + assert json.load(f) == {"number": 7, "title": "updated"} + assert (discussions_dir / "last_update").read_text() == "2026-02-01T00:00:00Z" + + +def test_backup_discussions_does_not_advance_checkpoint_on_discussion_error( + create_args, tmp_path +): + args = create_args(token_classic="fake_token", include_discussions=True, incremental=True) + repository = {"full_name": "owner/repo"} + discussions_dir = tmp_path / "discussions" + discussions_dir.mkdir() + (discussions_dir / "last_update").write_text("2026-01-01T00:00:00Z") + + with patch( + "github_backup.github_backup.retrieve_discussion_summaries", + return_value=( + [{"number": 7, "title": "updated", "updatedAt": "2026-02-01T00:00:00Z"}], + "2026-02-01T00:00:00Z", + True, + 1, + ), + ), patch( + "github_backup.github_backup.retrieve_discussion", + side_effect=Exception("temporary GraphQL error"), + ): + github_backup.backup_discussions(args, tmp_path, repository) + + assert (discussions_dir / "last_update").read_text() == "2026-01-01T00:00:00Z" + assert not os.path.exists(discussions_dir / "7.json") + + +def test_backup_discussions_skips_without_auth(create_args, tmp_path): + args = create_args(include_discussions=True) + repository = {"full_name": "owner/repo"} + + with patch("github_backup.github_backup.retrieve_discussion_summaries") as mock_retrieve: + github_backup.backup_discussions(args, tmp_path, repository) + + assert not mock_retrieve.called + assert not os.path.exists(tmp_path / "discussions") diff --git a/tests/test_http_451.py b/tests/test_http_451.py index d53d65c..bba866e 100644 --- a/tests/test_http_451.py +++ b/tests/test_http_451.py @@ -1,93 +1,202 @@ -"""Tests for HTTP 451 (DMCA takedown) handling.""" +"""Tests for HTTP 451 (DMCA takedown) and HTTP 403 (TOS) handling.""" +import io import json -from unittest.mock import Mock, patch +from unittest.mock import patch +from urllib.error import HTTPError import pytest from github_backup import github_backup +def _make_http_error(code, body_bytes, msg="Error", headers=None): + """Create an HTTPError with a readable body (like a real urllib response).""" + if headers is None: + headers = {"x-ratelimit-remaining": "5000"} + return HTTPError( + url="https://api.github.com/repos/test/repo", + code=code, + msg=msg, + hdrs=headers, + fp=io.BytesIO(body_bytes), + ) + + class TestHTTP451Exception: """Test suite for HTTP 451 DMCA takedown exception handling.""" - def test_repository_unavailable_error_raised(self): + def test_repository_unavailable_error_raised(self, create_args): """HTTP 451 should raise RepositoryUnavailableError with DMCA URL.""" - args = Mock() - args.as_app = False - args.token_fine = None - args.token_classic = None - args.osx_keychain_item_name = None - args.osx_keychain_item_account = None - args.throttle_limit = None - args.throttle_pause = 0 - - mock_response = Mock() - mock_response.getcode.return_value = 451 + args = create_args() dmca_data = { "message": "Repository access blocked", "block": { "reason": "dmca", "created_at": "2024-11-12T14:38:04Z", - "html_url": "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md" - } + "html_url": "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md", + }, } - mock_response.read.return_value = json.dumps(dmca_data).encode("utf-8") - mock_response.headers = {"x-ratelimit-remaining": "5000"} - mock_response.reason = "Unavailable For Legal Reasons" + body = json.dumps(dmca_data).encode("utf-8") - with patch("github_backup.github_backup.make_request_with_retry", return_value=mock_response): - with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info: - github_backup.retrieve_data(args, "https://api.github.com/repos/test/dmca/issues") + def mock_urlopen(*a, **kw): + raise _make_http_error(451, body, msg="Unavailable For Legal Reasons") - assert exc_info.value.dmca_url == "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md" + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info: + github_backup.retrieve_data( + args, "https://api.github.com/repos/test/dmca/issues" + ) + + assert ( + exc_info.value.legal_url + == "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md" + ) assert "451" in str(exc_info.value) - def test_repository_unavailable_error_without_dmca_url(self): + def test_repository_unavailable_error_without_legal_url(self, create_args): """HTTP 451 without DMCA details should still raise exception.""" - args = Mock() - args.as_app = False - args.token_fine = None - args.token_classic = None - args.osx_keychain_item_name = None - args.osx_keychain_item_account = None - args.throttle_limit = None - args.throttle_pause = 0 - - mock_response = Mock() - mock_response.getcode.return_value = 451 - mock_response.read.return_value = b'{"message": "Blocked"}' - mock_response.headers = {"x-ratelimit-remaining": "5000"} - mock_response.reason = "Unavailable For Legal Reasons" - - with patch("github_backup.github_backup.make_request_with_retry", return_value=mock_response): + args = create_args() + + def mock_urlopen(*a, **kw): + raise _make_http_error(451, b'{"message": "Blocked"}') + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info: - github_backup.retrieve_data(args, "https://api.github.com/repos/test/dmca/issues") + github_backup.retrieve_data( + args, "https://api.github.com/repos/test/dmca/issues" + ) - assert exc_info.value.dmca_url is None + assert exc_info.value.legal_url is None assert "451" in str(exc_info.value) - def test_repository_unavailable_error_with_malformed_json(self): + def test_repository_unavailable_error_with_malformed_json(self, create_args): """HTTP 451 with malformed JSON should still raise exception.""" - args = Mock() - args.as_app = False - args.token_fine = None - args.token_classic = None - args.osx_keychain_item_name = None - args.osx_keychain_item_account = None - args.throttle_limit = None - args.throttle_pause = 0 - - mock_response = Mock() - mock_response.getcode.return_value = 451 - mock_response.read.return_value = b"invalid json {" - mock_response.headers = {"x-ratelimit-remaining": "5000"} - mock_response.reason = "Unavailable For Legal Reasons" - - with patch("github_backup.github_backup.make_request_with_retry", return_value=mock_response): + args = create_args() + + def mock_urlopen(*a, **kw): + raise _make_http_error(451, b"invalid json {") + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): with pytest.raises(github_backup.RepositoryUnavailableError): - github_backup.retrieve_data(args, "https://api.github.com/repos/test/dmca/issues") + github_backup.retrieve_data( + args, "https://api.github.com/repos/test/dmca/issues" + ) + + +class TestHTTP403TOS: + """Test suite for HTTP 403 TOS violation handling.""" + + def test_403_tos_raises_repository_unavailable(self, create_args): + """HTTP 403 (non-rate-limit) should raise RepositoryUnavailableError.""" + args = create_args() + + tos_data = { + "message": "Repository access blocked", + "block": { + "reason": "tos", + "html_url": "https://github.com/contact/tos-violation", + }, + } + body = json.dumps(tos_data).encode("utf-8") + + def mock_urlopen(*a, **kw): + raise _make_http_error( + 403, + body, + msg="Forbidden", + headers={"x-ratelimit-remaining": "5000"}, + ) + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info: + github_backup.retrieve_data( + args, "https://api.github.com/repos/test/blocked/issues" + ) + + assert ( + exc_info.value.legal_url == "https://github.com/contact/tos-violation" + ) + assert "403" in str(exc_info.value) + + def test_403_permission_denied_not_converted(self, create_args): + """HTTP 403 without 'block' in body should propagate as HTTPError, not RepositoryUnavailableError.""" + args = create_args() + + body = json.dumps({"message": "Must have admin rights to Repository."}).encode( + "utf-8" + ) + + def mock_urlopen(*a, **kw): + raise _make_http_error( + 403, + body, + msg="Forbidden", + headers={"x-ratelimit-remaining": "5000"}, + ) + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + with pytest.raises(HTTPError) as exc_info: + github_backup.retrieve_data( + args, "https://api.github.com/repos/test/private/issues" + ) + + assert exc_info.value.code == 403 + + def test_403_rate_limit_not_converted(self, create_args): + """HTTP 403 with rate limit exhausted should NOT become RepositoryUnavailableError.""" + args = create_args() + + call_count = 0 + + def mock_urlopen(*a, **kw): + nonlocal call_count + call_count += 1 + raise _make_http_error( + 403, + b'{"message": "rate limit"}', + msg="Forbidden", + headers={"x-ratelimit-remaining": "0"}, + ) + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + with patch( + "github_backup.github_backup.calculate_retry_delay", return_value=0 + ): + with pytest.raises(HTTPError) as exc_info: + github_backup.retrieve_data( + args, "https://api.github.com/repos/test/ratelimit/issues" + ) + + assert exc_info.value.code == 403 + # Should have retried (not raised immediately as RepositoryUnavailableError) + assert call_count > 1 + + +class TestRetrieveRepositoriesUnavailable: + """Test that retrieve_repositories handles RepositoryUnavailableError gracefully.""" + + def test_unavailable_repo_returns_empty_list(self, create_args): + """retrieve_repositories should return [] when the repo is unavailable.""" + args = create_args(repository="blocked-repo") + + def mock_urlopen(*a, **kw): + raise _make_http_error( + 451, + json.dumps( + { + "message": "Blocked", + "block": {"html_url": "https://example.com/dmca"}, + } + ).encode("utf-8"), + msg="Unavailable For Legal Reasons", + ) + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + repos = github_backup.retrieve_repositories(args, {"login": None}) + + assert repos == [] if __name__ == "__main__": diff --git a/tests/test_incremental_per_repository.py b/tests/test_incremental_per_repository.py new file mode 100644 index 0000000..f1fd67a --- /dev/null +++ b/tests/test_incremental_per_repository.py @@ -0,0 +1,189 @@ +"""Tests for per-resource incremental checkpoints.""" + +import json +import os + +from github_backup import github_backup + + +def _repo(name, updated_at, pushed_at=None): + return { + "name": name, + "full_name": "owner/{0}".format(name), + "owner": {"login": "owner"}, + "clone_url": "https://github.com/owner/{0}.git".format(name), + "private": False, + "fork": False, + "has_wiki": False, + "updated_at": updated_at, + "pushed_at": pushed_at, + } + + +def test_incremental_uses_per_resource_last_update( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repositories = [ + _repo("repo-one", "2026-02-01T00:00:00Z"), + _repo("repo-two", "2026-03-01T00:00:00Z"), + ] + repo_one_issues = tmp_path / "repositories" / "repo-one" / "issues" + repo_two_issues = tmp_path / "repositories" / "repo-two" / "issues" + repo_one_issues.mkdir(parents=True) + repo_two_issues.mkdir(parents=True) + (repo_one_issues / "last_update").write_text("2026-01-01T00:00:00Z") + (repo_two_issues / "last_update").write_text("2025-01-01T00:00:00Z") + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append((repository["name"], passed_args.since)) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, repositories) + + assert seen_since == [ + ("repo-one", "2026-01-01T00:00:00Z"), + ("repo-two", "2025-01-01T00:00:00Z"), + ] + assert (repo_one_issues / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert (repo_two_issues / "last_update").read_text() == "2026-03-01T00:00:00Z" + assert not os.path.exists(tmp_path / "last_update") + + +def test_incremental_uses_independent_issue_and_pull_checkpoints( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True, include_pulls=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + repo_dir = tmp_path / "repositories" / "repo-one" + issues_dir = repo_dir / "issues" + pulls_dir = repo_dir / "pulls" + issues_dir.mkdir(parents=True) + pulls_dir.mkdir(parents=True) + (issues_dir / "last_update").write_text("2026-01-01T00:00:00Z") + (pulls_dir / "last_update").write_text("2025-01-01T00:00:00Z") + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append(("issues", passed_args.since)) + + def fake_backup_pulls(passed_args, repo_cwd, repository, repos_template): + seen_since.append(("pulls", passed_args.since)) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + monkeypatch.setattr(github_backup, "backup_pulls", fake_backup_pulls) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert seen_since == [ + ("issues", "2026-01-01T00:00:00Z"), + ("pulls", "2025-01-01T00:00:00Z"), + ] + assert (issues_dir / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert (pulls_dir / "last_update").read_text() == "2026-02-01T00:00:00Z" + + +def test_incremental_uses_legacy_global_last_update_for_existing_resource_backup( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z") + issues_dir = tmp_path / "repositories" / "repo-one" / "issues" + issues_dir.mkdir(parents=True) + with open(issues_dir / "1.json", "w", encoding="utf-8") as f: + json.dump({"number": 1}, f) + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append(passed_args.since) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert seen_since == ["2026-01-01T00:00:00Z"] + assert (issues_dir / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert not os.path.exists(tmp_path / "last_update") + + +def test_incremental_does_not_use_legacy_global_last_update_for_new_resource_backup( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2099-01-01T00:00:00Z") + + seen_since = [] + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + seen_since.append(passed_args.since) + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert seen_since == [None] + assert ( + tmp_path / "repositories" / "repo-one" / "issues" / "last_update" + ).read_text() == "2026-02-01T00:00:00Z" + assert not os.path.exists(tmp_path / "last_update") + + +def test_incremental_keeps_legacy_global_last_update_until_all_existing_resources_migrated( + create_args, tmp_path, monkeypatch +): + args = create_args(incremental=True, include_issues=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z") + repo_one_issues = tmp_path / "repositories" / "repo-one" / "issues" + repo_two_issues = tmp_path / "repositories" / "repo-two" / "issues" + repo_one_issues.mkdir(parents=True) + repo_two_issues.mkdir(parents=True) + with open(repo_one_issues / "1.json", "w", encoding="utf-8") as f: + json.dump({"number": 1}, f) + with open(repo_two_issues / "2.json", "w", encoding="utf-8") as f: + json.dump({"number": 2}, f) + + def fake_backup_issues(passed_args, repo_cwd, repository, repos_template): + pass + + monkeypatch.setattr(github_backup, "backup_issues", fake_backup_issues) + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert (repo_one_issues / "last_update").read_text() == "2026-02-01T00:00:00Z" + assert not os.path.exists(repo_two_issues / "last_update") + assert (tmp_path / "last_update").read_text() == "2026-01-01T00:00:00Z" + + +def test_incremental_does_not_remove_legacy_checkpoint_without_resource_work( + create_args, tmp_path +): + args = create_args(incremental=True, include_repository=True) + repository = _repo("repo-one", "2026-02-01T00:00:00Z") + (tmp_path / "last_update").write_text("2026-01-01T00:00:00Z") + + github_backup.backup_repositories(args, tmp_path, [repository]) + + assert (tmp_path / "last_update").read_text() == "2026-01-01T00:00:00Z" + assert not os.path.exists( + tmp_path / "repositories" / "repo-one" / "issues" / "last_update" + ) + + +def test_repository_checkpoint_time_uses_newest_available_repo_timestamp(): + repository = _repo( + "repo-one", + updated_at="2026-02-01T00:00:00Z", + pushed_at="2026-03-01T00:00:00Z", + ) + + assert github_backup.get_repository_checkpoint_time(repository) == ( + "2026-03-01T00:00:00Z" + ) diff --git a/tests/test_pagination.py b/tests/test_pagination.py index 831b913..1931042 100644 --- a/tests/test_pagination.py +++ b/tests/test_pagination.py @@ -1,9 +1,7 @@ """Tests for Link header pagination handling.""" import json -from unittest.mock import Mock, patch - -import pytest +from unittest.mock import patch from github_backup import github_backup @@ -38,22 +36,9 @@ def headers(self): return headers -@pytest.fixture -def mock_args(): - """Mock args for retrieve_data.""" - args = Mock() - args.as_app = False - args.token_fine = None - args.token_classic = "fake_token" - args.osx_keychain_item_name = None - args.osx_keychain_item_account = None - args.throttle_limit = None - args.throttle_pause = 0 - return args - - -def test_cursor_based_pagination(mock_args): +def test_cursor_based_pagination(create_args): """Link header with 'after' cursor parameter works correctly.""" + args = create_args(token_classic="fake_token") # Simulate issues endpoint behavior: returns cursor in Link header responses = [ @@ -76,7 +61,7 @@ def mock_urlopen(request, *args, **kwargs): with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): results = github_backup.retrieve_data( - mock_args, "https://api.github.com/repos/owner/repo/issues" + args, "https://api.github.com/repos/owner/repo/issues" ) # Verify all items retrieved and cursor was used in second request @@ -85,8 +70,9 @@ def mock_urlopen(request, *args, **kwargs): assert "after=ABC123" in requests_made[1] -def test_page_based_pagination(mock_args): +def test_page_based_pagination(create_args): """Link header with 'page' parameter works correctly.""" + args = create_args(token_classic="fake_token") # Simulate pulls/repos endpoint behavior: returns page numbers in Link header responses = [ @@ -109,7 +95,7 @@ def mock_urlopen(request, *args, **kwargs): with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): results = github_backup.retrieve_data( - mock_args, "https://api.github.com/repos/owner/repo/pulls" + args, "https://api.github.com/repos/owner/repo/pulls" ) # Verify all items retrieved and page parameter was used (not cursor) @@ -119,8 +105,9 @@ def mock_urlopen(request, *args, **kwargs): assert "after" not in requests_made[1] -def test_no_link_header_stops_pagination(mock_args): +def test_no_link_header_stops_pagination(create_args): """Pagination stops when Link header is absent.""" + args = create_args(token_classic="fake_token") # Simulate endpoint with results that fit in a single page responses = [ @@ -137,7 +124,7 @@ def mock_urlopen(request, *args, **kwargs): with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): results = github_backup.retrieve_data( - mock_args, "https://api.github.com/repos/owner/repo/labels" + args, "https://api.github.com/repos/owner/repo/labels" ) # Verify pagination stopped after first request diff --git a/tests/test_pull_incremental_pagination.py b/tests/test_pull_incremental_pagination.py new file mode 100644 index 0000000..ac0f83f --- /dev/null +++ b/tests/test_pull_incremental_pagination.py @@ -0,0 +1,131 @@ +"""Tests for incremental pull request pagination.""" + +import json +import os +from unittest.mock import patch + +from github_backup import github_backup + + +class MockHTTPResponse: + def __init__(self, data, link_header=None): + self._content = json.dumps(data).encode("utf-8") + self._link_header = link_header + self._read = False + self.reason = "OK" + + def getcode(self): + return 200 + + def read(self): + if self._read: + return b"" + self._read = True + return self._content + + @property + def headers(self): + headers = {"x-ratelimit-remaining": "5000"} + if self._link_header: + headers["Link"] = self._link_header + return headers + + +def test_backup_pulls_incremental_excludes_checkpoint_timestamp(create_args, tmp_path): + args = create_args(include_pulls=True, incremental=True) + args.since = "2026-04-26T08:13:46Z" + repository = {"full_name": "owner/repo"} + + responses = [ + MockHTTPResponse([]), + MockHTTPResponse( + [ + { + "number": 1, + "title": "already backed up", + "updated_at": "2026-04-26T08:13:46Z", + }, + ], + link_header='; rel="next"', + ), + MockHTTPResponse( + [ + { + "number": 0, + "title": "older pull on page 2", + "updated_at": "2026-04-25T07:00:00Z", + } + ] + ), + ] + requests_made = [] + + def mock_urlopen(request, *args, **kwargs): + requests_made.append(request.get_full_url()) + return responses[len(requests_made) - 1] + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + assert len(requests_made) == 2 + assert "state=open" in requests_made[0] + assert "state=closed" in requests_made[1] + assert all("page=2" not in url for url in requests_made) + assert not os.path.exists(tmp_path / "pulls" / "1.json") + assert not os.path.exists(tmp_path / "pulls" / "0.json") + + +def test_backup_pulls_incremental_stops_before_fetching_old_pages( + create_args, tmp_path +): + args = create_args(include_pulls=True, incremental=True) + args.since = "2026-04-26T08:13:46Z" + repository = {"full_name": "owner/repo"} + + responses = [ + MockHTTPResponse([]), + MockHTTPResponse( + [ + { + "number": 2, + "title": "new pull", + "updated_at": "2026-04-26T09:00:00Z", + }, + { + "number": 1, + "title": "old pull", + "updated_at": "2026-04-26T07:00:00Z", + }, + ], + link_header='; rel="next"', + ), + MockHTTPResponse( + [ + { + "number": 0, + "title": "older pull on page 2", + "updated_at": "2026-04-25T07:00:00Z", + } + ] + ), + ] + requests_made = [] + + def mock_urlopen(request, *args, **kwargs): + requests_made.append(request.get_full_url()) + return responses[len(requests_made) - 1] + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + assert len(requests_made) == 2 + assert "state=open" in requests_made[0] + assert "state=closed" in requests_made[1] + assert all("page=2" not in url for url in requests_made) + assert os.path.exists(tmp_path / "pulls" / "2.json") + assert not os.path.exists(tmp_path / "pulls" / "1.json") + assert not os.path.exists(tmp_path / "pulls" / "0.json") diff --git a/tests/test_pull_reviews.py b/tests/test_pull_reviews.py new file mode 100644 index 0000000..2ce9ad1 --- /dev/null +++ b/tests/test_pull_reviews.py @@ -0,0 +1,237 @@ +"""Tests for pull request review backups.""" + +import json +import os + +from github_backup import github_backup + + +def test_parse_args_pull_reviews_flag(): + args = github_backup.parse_args(["--pull-reviews", "testuser"]) + assert args.include_pull_reviews is True + + +def test_backup_pulls_includes_review_data(create_args, tmp_path, monkeypatch): + args = create_args(include_pulls=True, include_pull_reviews=True) + repository = {"full_name": "owner/repo"} + calls = [] + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): + calls.append((template, query_args)) + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + "updated_at": "2026-02-01T00:00:00Z", + "title": "Add feature", + } + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + return [ + { + "id": 123, + "state": "APPROVED", + "body": "Looks good", + "submitted_at": "2026-02-01T00:00:00Z", + } + ] + raise AssertionError("Unexpected template: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + with open(tmp_path / "pulls" / "1.json", encoding="utf-8") as f: + pull = json.load(f) + + assert pull["review_data"] == [ + { + "body": "Looks good", + "id": 123, + "state": "APPROVED", + "submitted_at": "2026-02-01T00:00:00Z", + } + ] + assert ( + "https://api.github.com/repos/owner/repo/pulls/1/reviews", + None, + ) in calls + + +def test_pull_reviews_backfill_ignores_repository_checkpoint( + create_args, tmp_path, monkeypatch +): + args = create_args( + include_pulls=True, + include_pull_reviews=True, + incremental=True, + ) + args.since = "2026-01-01T00:00:00Z" + repository = {"full_name": "owner/repo"} + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + "updated_at": "2025-01-01T00:00:00Z", + "title": "Old pull request", + } + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + return [{"id": 123, "state": "APPROVED"}] + raise AssertionError("Unexpected template: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + with open(tmp_path / "pulls" / "1.json", encoding="utf-8") as f: + pull = json.load(f) + + assert pull["review_data"] == [{"id": 123, "state": "APPROVED"}] + assert (tmp_path / "pulls" / "reviews_last_update").read_text() == ( + "2025-01-01T00:00:00Z" + ) + + +def test_pull_reviews_uses_review_checkpoint_when_older_than_repository_checkpoint( + create_args, tmp_path, monkeypatch +): + args = create_args( + include_pulls=True, + include_pull_reviews=True, + incremental=True, + ) + args.since = "2026-01-01T00:00:00Z" + repository = {"full_name": "owner/repo"} + pulls_dir = tmp_path / "pulls" + pulls_dir.mkdir() + (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z") + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + "updated_at": "2025-06-01T00:00:00Z", + "title": "Review changed while feature was disabled", + }, + { + "number": 2, + "updated_at": "2024-12-01T00:00:00Z", + "title": "Too old", + }, + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + return [{"id": 123, "state": "COMMENTED"}] + raise AssertionError("Unexpected template: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + assert os.path.exists(tmp_path / "pulls" / "1.json") + assert not os.path.exists(tmp_path / "pulls" / "2.json") + assert (tmp_path / "pulls" / "reviews_last_update").read_text() == ( + "2025-06-01T00:00:00Z" + ) + + +def test_pull_reviews_preserves_existing_optional_pull_data( + create_args, tmp_path, monkeypatch +): + args = create_args(include_pulls=True, include_pull_reviews=True) + repository = {"full_name": "owner/repo"} + pulls_dir = tmp_path / "pulls" + pulls_dir.mkdir() + with open(pulls_dir / "1.json", "w", encoding="utf-8") as f: + json.dump( + { + "number": 1, + "updated_at": "2026-01-01T00:00:00Z", + "comment_data": [{"id": 10, "body": "inline comment"}], + "comment_regular_data": [{"id": 11, "body": "regular comment"}], + "commit_data": [{"sha": "abc"}], + }, + f, + ) + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + "updated_at": "2026-02-01T00:00:00Z", + "title": "Add reviews", + } + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + return [{"id": 123, "state": "APPROVED"}] + raise AssertionError("Unexpected template: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + with open(pulls_dir / "1.json", encoding="utf-8") as f: + pull = json.load(f) + + assert pull["review_data"] == [{"id": 123, "state": "APPROVED"}] + assert pull["comment_data"] == [{"id": 10, "body": "inline comment"}] + assert pull["comment_regular_data"] == [{"id": 11, "body": "regular comment"}] + assert pull["commit_data"] == [{"sha": "abc"}] + + +def test_pull_reviews_does_not_advance_checkpoint_on_review_error( + create_args, tmp_path, monkeypatch +): + args = create_args( + include_pulls=True, + include_pull_reviews=True, + incremental=True, + ) + args.since = "2026-01-01T00:00:00Z" + repository = {"full_name": "owner/repo"} + pulls_dir = tmp_path / "pulls" + pulls_dir.mkdir() + (pulls_dir / "reviews_last_update").write_text("2025-01-01T00:00:00Z") + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): + if template == "https://api.github.com/repos/owner/repo/pulls": + if query_args["state"] == "open": + return [ + { + "number": 1, + "updated_at": "2025-06-01T00:00:00Z", + "title": "Review retrieval fails", + } + ] + return [] + if template == "https://api.github.com/repos/owner/repo/pulls/1/reviews": + raise Exception("temporary API failure") + raise AssertionError("Unexpected template: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_pulls( + args, tmp_path, repository, "https://api.github.com/repos" + ) + + assert (pulls_dir / "reviews_last_update").read_text() == "2025-01-01T00:00:00Z" diff --git a/tests/test_releases.py b/tests/test_releases.py new file mode 100644 index 0000000..b8584f4 --- /dev/null +++ b/tests/test_releases.py @@ -0,0 +1,95 @@ +"""Tests for release backup behavior.""" + +from github_backup import github_backup + + +def test_backup_releases_uses_embedded_assets_without_extra_asset_list_request( + create_args, tmp_path, monkeypatch +): + args = create_args(include_releases=True, include_assets=True) + repository = {"full_name": "owner/repo", "name": "repo"} + calls = [] + downloads = [] + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): + calls.append(template) + if template == "https://api.github.com/repos/owner/repo/releases": + return [ + { + "tag_name": "v1.0.0", + "created_at": "2026-01-01T00:00:00Z", + "updated_at": "2026-01-01T00:00:00Z", + "prerelease": False, + "draft": False, + "assets_url": "https://api.github.com/repos/owner/repo/releases/1/assets", + "assets": [ + { + "name": "artifact.zip", + "url": "https://api.github.com/repos/owner/repo/releases/assets/1", + } + ], + } + ] + raise AssertionError("Unexpected API request: {0}".format(template)) + + def fake_download_file(url, path, auth, as_app=False, fine=False): + downloads.append((url, path)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + monkeypatch.setattr(github_backup, "download_file", fake_download_file) + + github_backup.backup_releases( + args, + tmp_path, + repository, + "https://api.github.com/repos", + include_assets=True, + ) + + assert calls == ["https://api.github.com/repos/owner/repo/releases"] + assert downloads == [ + ( + "https://api.github.com/repos/owner/repo/releases/assets/1", + str(tmp_path / "releases" / "v1.0.0" / "artifact.zip"), + ) + ] + + +def test_backup_releases_falls_back_to_assets_url_when_assets_missing( + create_args, tmp_path, monkeypatch +): + args = create_args(include_releases=True, include_assets=True) + repository = {"full_name": "owner/repo", "name": "repo"} + calls = [] + + def fake_retrieve_data(passed_args, template, query_args=None, paginated=True, **kwargs): + calls.append(template) + if template == "https://api.github.com/repos/owner/repo/releases": + return [ + { + "tag_name": "v1.0.0", + "created_at": "2026-01-01T00:00:00Z", + "updated_at": "2026-01-01T00:00:00Z", + "prerelease": False, + "draft": False, + "assets_url": "https://api.github.com/repos/owner/repo/releases/1/assets", + } + ] + if template == "https://api.github.com/repos/owner/repo/releases/1/assets": + return [] + raise AssertionError("Unexpected API request: {0}".format(template)) + + monkeypatch.setattr(github_backup, "retrieve_data", fake_retrieve_data) + + github_backup.backup_releases( + args, + tmp_path, + repository, + "https://api.github.com/repos", + include_assets=True, + ) + + assert calls == [ + "https://api.github.com/repos/owner/repo/releases", + "https://api.github.com/repos/owner/repo/releases/1/assets", + ] diff --git a/tests/test_retrieve_data.py b/tests/test_retrieve_data.py index adb1152..51848ef 100644 --- a/tests/test_retrieve_data.py +++ b/tests/test_retrieve_data.py @@ -1,6 +1,7 @@ """Tests for retrieve_data function.""" import json +import logging import socket from unittest.mock import Mock, patch from urllib.error import HTTPError, URLError @@ -9,26 +10,27 @@ from github_backup import github_backup from github_backup.github_backup import ( - MAX_RETRIES, calculate_retry_delay, make_request_with_retry, ) +# Default retry count used in tests (matches argparse default) +# With max_retries=5, total attempts = 6 (1 initial + 5 retries) +DEFAULT_MAX_RETRIES = 5 + class TestCalculateRetryDelay: def test_respects_retry_after_header(self): - headers = {'retry-after': '30'} + headers = {"retry-after": "30"} assert calculate_retry_delay(0, headers) == 30 def test_respects_rate_limit_reset(self): import time import calendar + # Set reset time 60 seconds in the future future_reset = calendar.timegm(time.gmtime()) + 60 - headers = { - 'x-ratelimit-remaining': '0', - 'x-ratelimit-reset': str(future_reset) - } + headers = {"x-ratelimit-remaining": "0", "x-ratelimit-reset": str(future_reset)} delay = calculate_retry_delay(0, headers) # Should be approximately 60 seconds (with some tolerance for execution time) assert 55 <= delay <= 65 @@ -50,12 +52,10 @@ def test_max_delay_cap(self): def test_minimum_rate_limit_delay(self): import time import calendar + # Set reset time in the past (already reset) past_reset = calendar.timegm(time.gmtime()) - 100 - headers = { - 'x-ratelimit-remaining': '0', - 'x-ratelimit-reset': str(past_reset) - } + headers = {"x-ratelimit-remaining": "0", "x-ratelimit-reset": str(past_reset)} delay = calculate_retry_delay(0, headers) # Should be minimum 10 seconds even if reset time is in past assert delay >= 10 @@ -64,20 +64,9 @@ def test_minimum_rate_limit_delay(self): class TestRetrieveDataRetry: """Tests for retry behavior in retrieve_data.""" - @pytest.fixture - def mock_args(self): - args = Mock() - args.as_app = False - args.token_fine = None - args.token_classic = "fake_token" - args.osx_keychain_item_name = None - args.osx_keychain_item_account = None - args.throttle_limit = None - args.throttle_pause = 0 - return args - - def test_json_parse_error_retries_and_fails(self, mock_args): + def test_json_parse_error_retries_and_fails(self, create_args): """HTTP 200 with invalid JSON should retry and eventually fail.""" + args = create_args(token_classic="fake_token") mock_response = Mock() mock_response.getcode.return_value = 200 mock_response.read.return_value = b"not valid json {" @@ -85,21 +74,31 @@ def test_json_parse_error_retries_and_fails(self, mock_args): call_count = 0 - def mock_make_request(*args, **kwargs): + def mock_make_request(*a, **kw): nonlocal call_count call_count += 1 return mock_response - with patch("github_backup.github_backup.make_request_with_retry", side_effect=mock_make_request): - with patch("github_backup.github_backup.calculate_retry_delay", return_value=0): # No delay in tests + with patch( + "github_backup.github_backup.make_request_with_retry", + side_effect=mock_make_request, + ): + with patch( + "github_backup.github_backup.calculate_retry_delay", return_value=0 + ): # No delay in tests with pytest.raises(Exception) as exc_info: - github_backup.retrieve_data(mock_args, "https://api.github.com/repos/test/repo/issues") + github_backup.retrieve_data( + args, "https://api.github.com/repos/test/repo/issues" + ) assert "Failed to read response after" in str(exc_info.value) - assert call_count == MAX_RETRIES + assert ( + call_count == DEFAULT_MAX_RETRIES + 1 + ) # 1 initial + 5 retries = 6 attempts - def test_json_parse_error_recovers_on_retry(self, mock_args): + def test_json_parse_error_recovers_on_retry(self, create_args): """HTTP 200 with invalid JSON should succeed if retry returns valid JSON.""" + args = create_args(token_classic="fake_token") bad_response = Mock() bad_response.getcode.return_value = 200 bad_response.read.return_value = b"not valid json {" @@ -113,32 +112,47 @@ def test_json_parse_error_recovers_on_retry(self, mock_args): responses = [bad_response, bad_response, good_response] call_count = 0 - def mock_make_request(*args, **kwargs): + def mock_make_request(*a, **kw): nonlocal call_count result = responses[call_count] call_count += 1 return result - with patch("github_backup.github_backup.make_request_with_retry", side_effect=mock_make_request): - with patch("github_backup.github_backup.calculate_retry_delay", return_value=0): - result = github_backup.retrieve_data(mock_args, "https://api.github.com/repos/test/repo/issues") + with patch( + "github_backup.github_backup.make_request_with_retry", + side_effect=mock_make_request, + ): + with patch( + "github_backup.github_backup.calculate_retry_delay", return_value=0 + ): + result = github_backup.retrieve_data( + args, "https://api.github.com/repos/test/repo/issues" + ) assert result == [{"id": 1}] assert call_count == 3 # Failed twice, succeeded on third - def test_http_error_raises_exception(self, mock_args): + def test_http_error_raises_exception(self, create_args): """Non-success HTTP status codes should raise Exception.""" + args = create_args(token_classic="fake_token") mock_response = Mock() mock_response.getcode.return_value = 404 mock_response.read.return_value = b'{"message": "Not Found"}' mock_response.headers = {"x-ratelimit-remaining": "5000"} mock_response.reason = "Not Found" - with patch("github_backup.github_backup.make_request_with_retry", return_value=mock_response): + with patch( + "github_backup.github_backup.make_request_with_retry", + return_value=mock_response, + ): with pytest.raises(Exception) as exc_info: - github_backup.retrieve_data(mock_args, "https://api.github.com/repos/test/notfound/issues") + github_backup.retrieve_data( + args, "https://api.github.com/repos/test/notfound/issues" + ) - assert not isinstance(exc_info.value, github_backup.RepositoryUnavailableError) + assert not isinstance( + exc_info.value, github_backup.RepositoryUnavailableError + ) assert "404" in str(exc_info.value) @@ -151,7 +165,7 @@ def test_502_error_retries_and_succeeds(self): good_response.read.return_value = b'{"ok": true}' call_count = 0 - fail_count = MAX_RETRIES - 1 # Fail all but last attempt + fail_count = DEFAULT_MAX_RETRIES # Fail all retries, succeed on last attempt def mock_urlopen(*args, **kwargs): nonlocal call_count @@ -167,14 +181,18 @@ def mock_urlopen(*args, **kwargs): return good_response with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): - with patch("github_backup.github_backup.calculate_retry_delay", return_value=0): + with patch( + "github_backup.github_backup.calculate_retry_delay", return_value=0 + ): result = make_request_with_retry(Mock(), None) assert result == good_response - assert call_count == MAX_RETRIES + assert ( + call_count == DEFAULT_MAX_RETRIES + 1 + ) # 1 initial + 5 retries = 6 attempts def test_503_error_retries_until_exhausted(self): - """HTTP 503 should retry MAX_RETRIES times then raise.""" + """HTTP 503 should make 1 initial + DEFAULT_MAX_RETRIES retry attempts then raise.""" call_count = 0 def mock_urlopen(*args, **kwargs): @@ -189,12 +207,16 @@ def mock_urlopen(*args, **kwargs): ) with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): - with patch("github_backup.github_backup.calculate_retry_delay", return_value=0): + with patch( + "github_backup.github_backup.calculate_retry_delay", return_value=0 + ): with pytest.raises(HTTPError) as exc_info: make_request_with_retry(Mock(), None) assert exc_info.value.code == 503 - assert call_count == MAX_RETRIES + assert ( + call_count == DEFAULT_MAX_RETRIES + 1 + ) # 1 initial + 5 retries = 6 attempts def test_404_error_not_retried(self): """HTTP 404 should not be retried - raise immediately.""" @@ -237,7 +259,9 @@ def mock_urlopen(*args, **kwargs): return good_response with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): - with patch("github_backup.github_backup.calculate_retry_delay", return_value=0): + with patch( + "github_backup.github_backup.calculate_retry_delay", return_value=0 + ): result = make_request_with_retry(Mock(), None) assert result == good_response @@ -265,11 +289,33 @@ def mock_urlopen(*args, **kwargs): assert exc_info.value.code == 403 assert call_count == 1 # No retries + def test_451_error_not_retried(self): + """HTTP 451 should not be retried - raise immediately.""" + call_count = 0 + + def mock_urlopen(*args, **kwargs): + nonlocal call_count + call_count += 1 + raise HTTPError( + url="https://api.github.com/test", + code=451, + msg="Unavailable For Legal Reasons", + hdrs={"x-ratelimit-remaining": "5000"}, + fp=None, + ) + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + with pytest.raises(HTTPError) as exc_info: + make_request_with_retry(Mock(), None) + + assert exc_info.value.code == 451 + assert call_count == 1 # No retries + def test_connection_error_retries_and_succeeds(self): """URLError (connection error) should retry and succeed if subsequent request works.""" good_response = Mock() call_count = 0 - fail_count = MAX_RETRIES - 1 # Fail all but last attempt + fail_count = DEFAULT_MAX_RETRIES # Fail all retries, succeed on last attempt def mock_urlopen(*args, **kwargs): nonlocal call_count @@ -279,14 +325,18 @@ def mock_urlopen(*args, **kwargs): return good_response with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): - with patch("github_backup.github_backup.calculate_retry_delay", return_value=0): + with patch( + "github_backup.github_backup.calculate_retry_delay", return_value=0 + ): result = make_request_with_retry(Mock(), None) assert result == good_response - assert call_count == MAX_RETRIES + assert ( + call_count == DEFAULT_MAX_RETRIES + 1 + ) # 1 initial + 5 retries = 6 attempts def test_socket_error_retries_until_exhausted(self): - """socket.error should retry MAX_RETRIES times then raise.""" + """socket.error should make 1 initial + DEFAULT_MAX_RETRIES retry attempts then raise.""" call_count = 0 def mock_urlopen(*args, **kwargs): @@ -295,38 +345,70 @@ def mock_urlopen(*args, **kwargs): raise socket.error("Connection reset by peer") with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): - with patch("github_backup.github_backup.calculate_retry_delay", return_value=0): + with patch( + "github_backup.github_backup.calculate_retry_delay", return_value=0 + ): with pytest.raises(socket.error): make_request_with_retry(Mock(), None) - assert call_count == MAX_RETRIES + assert ( + call_count == DEFAULT_MAX_RETRIES + 1 + ) # 1 initial + 5 retries = 6 attempts + + +class TestRetrieveGraphqlDataLogging: + """Tests for GraphQL request logging.""" + + def test_logs_graphql_context(self, create_args, caplog): + args = create_args(token_classic="fake_token") + mock_response = Mock() + mock_response.getcode.return_value = 200 + mock_response.read.return_value = json.dumps({"data": {}}).encode("utf-8") + mock_response.headers = {"x-ratelimit-remaining": "5000"} + + caplog.set_level(logging.INFO, logger="github_backup.github_backup") + with patch( + "github_backup.github_backup.make_request_with_retry", + return_value=mock_response, + ): + github_backup.retrieve_graphql_data( + args, + "query { viewer { login } }", + log_context="discussion owner/repo#1", + ) + + assert ( + "Requesting https://api.github.com/graphql (discussion owner/repo#1)" + in caplog.text + ) class TestRetrieveDataThrottling: """Tests for throttling behavior in retrieve_data.""" - @pytest.fixture - def mock_args(self): - args = Mock() - args.as_app = False - args.token_fine = None - args.token_classic = "fake_token" - args.osx_keychain_item_name = None - args.osx_keychain_item_account = None - args.throttle_limit = 10 # Throttle when remaining <= 10 - args.throttle_pause = 5 # Pause 5 seconds - return args - - def test_throttling_pauses_when_rate_limit_low(self, mock_args): + def test_throttling_pauses_when_rate_limit_low(self, create_args): """Should pause when x-ratelimit-remaining is at or below throttle_limit.""" + args = create_args( + token_classic="fake_token", + throttle_limit=10, + throttle_pause=5, + ) mock_response = Mock() mock_response.getcode.return_value = 200 mock_response.read.return_value = json.dumps([{"id": 1}]).encode("utf-8") - mock_response.headers = {"x-ratelimit-remaining": "5", "Link": ""} # Below throttle_limit - - with patch("github_backup.github_backup.make_request_with_retry", return_value=mock_response): + mock_response.headers = { + "x-ratelimit-remaining": "5", + "Link": "", + } # Below throttle_limit + + with patch( + "github_backup.github_backup.make_request_with_retry", + return_value=mock_response, + ): with patch("github_backup.github_backup.time.sleep") as mock_sleep: - github_backup.retrieve_data(mock_args, "https://api.github.com/repos/test/repo/issues") + github_backup.retrieve_data( + args, "https://api.github.com/repos/test/repo/issues" + ) mock_sleep.assert_called_once_with(5) # throttle_pause value @@ -334,26 +416,114 @@ def test_throttling_pauses_when_rate_limit_low(self, mock_args): class TestRetrieveDataSingleItem: """Tests for single item (dict) responses in retrieve_data.""" - @pytest.fixture - def mock_args(self): - args = Mock() - args.as_app = False - args.token_fine = None - args.token_classic = "fake_token" - args.osx_keychain_item_name = None - args.osx_keychain_item_account = None - args.throttle_limit = None - args.throttle_pause = 0 - return args - - def test_dict_response_returned_as_list(self, mock_args): + def test_dict_response_returned_as_list(self, create_args): """Single dict response should be returned as a list with one item.""" + args = create_args(token_classic="fake_token") mock_response = Mock() mock_response.getcode.return_value = 200 - mock_response.read.return_value = json.dumps({"login": "testuser", "id": 123}).encode("utf-8") + mock_response.read.return_value = json.dumps( + {"login": "testuser", "id": 123} + ).encode("utf-8") mock_response.headers = {"x-ratelimit-remaining": "5000", "Link": ""} - with patch("github_backup.github_backup.make_request_with_retry", return_value=mock_response): - result = github_backup.retrieve_data(mock_args, "https://api.github.com/user") + with patch( + "github_backup.github_backup.make_request_with_retry", + return_value=mock_response, + ): + result = github_backup.retrieve_data( + args, "https://api.github.com/user" + ) assert result == [{"login": "testuser", "id": 123}] + + +class TestRetriesCliArgument: + """Tests for --retries CLI argument validation and behavior.""" + + def test_retries_argument_accepted(self): + """--retries flag should be accepted and parsed correctly.""" + args = github_backup.parse_args(["--retries", "3", "testuser"]) + assert args.max_retries == 3 + + def test_retries_default_value(self): + """--retries should default to 5 if not specified.""" + args = github_backup.parse_args(["testuser"]) + assert args.max_retries == 5 + + def test_retries_zero_is_valid(self): + """--retries 0 should be valid and mean 1 attempt (no retries).""" + args = github_backup.parse_args(["--retries", "0", "testuser"]) + assert args.max_retries == 0 + + def test_retries_negative_rejected(self): + """--retries with negative value should be rejected by argparse.""" + with pytest.raises(SystemExit): + github_backup.parse_args(["--retries", "-1", "testuser"]) + + def test_retries_non_integer_rejected(self): + """--retries with non-integer value should be rejected by argparse.""" + with pytest.raises(SystemExit): + github_backup.parse_args(["--retries", "abc", "testuser"]) + + def test_retries_one_with_transient_error_succeeds(self): + """--retries 1 should allow one retry after initial failure.""" + good_response = Mock() + good_response.read.return_value = b'{"ok": true}' + + call_count = 0 + + def mock_urlopen(*args, **kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + raise HTTPError( + url="https://api.github.com/test", + code=502, + msg="Bad Gateway", + hdrs={"x-ratelimit-remaining": "5000"}, + fp=None, + ) + return good_response + + with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen): + with patch( + "github_backup.github_backup.calculate_retry_delay", return_value=0 + ): + result = make_request_with_retry(Mock(), None, max_retries=1) + + assert result == good_response + assert call_count == 2 # 1 initial + 1 retry = 2 attempts + + def test_custom_retry_count_limits_attempts(self, create_args): + """Custom --retries value should limit actual retry attempts.""" + args = create_args( + token_classic="fake_token", + max_retries=2, # 2 retries = 3 total attempts (1 initial + 2 retries) + ) + + mock_response = Mock() + mock_response.getcode.return_value = 200 + mock_response.read.return_value = b"not valid json {" + mock_response.headers = {"x-ratelimit-remaining": "5000"} + + call_count = 0 + + def mock_make_request(*args, **kwargs): + nonlocal call_count + call_count += 1 + return mock_response + + with patch( + "github_backup.github_backup.make_request_with_retry", + side_effect=mock_make_request, + ): + with patch( + "github_backup.github_backup.calculate_retry_delay", return_value=0 + ): + with pytest.raises(Exception) as exc_info: + github_backup.retrieve_data( + args, "https://api.github.com/repos/test/repo/issues" + ) + + assert "Failed to read response after 3 attempts" in str(exc_info.value) + assert call_count == 3 # 1 initial + 2 retries = 3 attempts diff --git a/tests/test_skip_assets_on.py b/tests/test_skip_assets_on.py index ce28287..519750e 100644 --- a/tests/test_skip_assets_on.py +++ b/tests/test_skip_assets_on.py @@ -1,7 +1,7 @@ """Tests for --skip-assets-on flag behavior (issue #135).""" import pytest -from unittest.mock import Mock, patch +from unittest.mock import patch from github_backup import github_backup @@ -13,52 +13,6 @@ class TestSkipAssetsOn: while still backing up release metadata. """ - def _create_mock_args(self, **overrides): - """Create a mock args object with sensible defaults.""" - args = Mock() - args.user = "testuser" - args.output_directory = "/tmp/backup" - args.include_repository = False - args.include_everything = False - args.include_gists = False - args.include_starred_gists = False - args.all_starred = False - args.skip_existing = False - args.bare_clone = False - args.lfs_clone = False - args.no_prune = False - args.include_wiki = False - args.include_issues = False - args.include_issue_comments = False - args.include_issue_events = False - args.include_pulls = False - args.include_pull_comments = False - args.include_pull_commits = False - args.include_pull_details = False - args.include_labels = False - args.include_hooks = False - args.include_milestones = False - args.include_releases = True - args.include_assets = True - args.skip_assets_on = [] - args.include_attachments = False - args.incremental = False - args.incremental_by_files = False - args.github_host = None - args.prefer_ssh = False - args.token_classic = "test-token" - args.token_fine = None - args.as_app = False - args.osx_keychain_item_name = None - args.osx_keychain_item_account = None - args.skip_prerelease = False - args.number_of_latest_releases = None - - for key, value in overrides.items(): - setattr(args, key, value) - - return args - def _create_mock_repository(self, name="test-repo", owner="testuser"): """Create a mock repository object.""" return { @@ -123,10 +77,10 @@ class TestSkipAssetsOnBehavior(TestSkipAssetsOn): @patch("github_backup.github_backup.mkdir_p") @patch("github_backup.github_backup.json_dump_if_changed") def test_assets_downloaded_when_not_skipped( - self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download + self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download, create_args ): """Assets should be downloaded when repo is not in skip list.""" - args = self._create_mock_args(skip_assets_on=[]) + args = create_args(skip_assets_on=[]) repository = self._create_mock_repository(name="normal-repo") release = self._create_mock_release() asset = self._create_mock_asset() @@ -154,10 +108,10 @@ def test_assets_downloaded_when_not_skipped( @patch("github_backup.github_backup.mkdir_p") @patch("github_backup.github_backup.json_dump_if_changed") def test_assets_skipped_when_repo_name_matches( - self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download + self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download, create_args ): """Assets should be skipped when repo name is in skip list.""" - args = self._create_mock_args(skip_assets_on=["big-repo"]) + args = create_args(skip_assets_on=["big-repo"]) repository = self._create_mock_repository(name="big-repo") release = self._create_mock_release() @@ -180,10 +134,10 @@ def test_assets_skipped_when_repo_name_matches( @patch("github_backup.github_backup.mkdir_p") @patch("github_backup.github_backup.json_dump_if_changed") def test_assets_skipped_when_full_name_matches( - self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download + self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download, create_args ): """Assets should be skipped when owner/repo format matches.""" - args = self._create_mock_args(skip_assets_on=["otheruser/big-repo"]) + args = create_args(skip_assets_on=["otheruser/big-repo"]) repository = self._create_mock_repository(name="big-repo", owner="otheruser") release = self._create_mock_release() @@ -206,11 +160,11 @@ def test_assets_skipped_when_full_name_matches( @patch("github_backup.github_backup.mkdir_p") @patch("github_backup.github_backup.json_dump_if_changed") def test_case_insensitive_matching( - self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download + self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download, create_args ): """Skip matching should be case-insensitive.""" # User types uppercase, repo name is lowercase - args = self._create_mock_args(skip_assets_on=["BIG-REPO"]) + args = create_args(skip_assets_on=["BIG-REPO"]) repository = self._create_mock_repository(name="big-repo") release = self._create_mock_release() @@ -233,10 +187,10 @@ def test_case_insensitive_matching( @patch("github_backup.github_backup.mkdir_p") @patch("github_backup.github_backup.json_dump_if_changed") def test_multiple_skip_repos( - self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download + self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download, create_args ): """Multiple repos in skip list should all be skipped.""" - args = self._create_mock_args(skip_assets_on=["repo1", "repo2", "repo3"]) + args = create_args(skip_assets_on=["repo1", "repo2", "repo3"]) repository = self._create_mock_repository(name="repo2") release = self._create_mock_release() @@ -259,10 +213,10 @@ def test_multiple_skip_repos( @patch("github_backup.github_backup.mkdir_p") @patch("github_backup.github_backup.json_dump_if_changed") def test_release_metadata_still_saved_when_assets_skipped( - self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download + self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download, create_args ): """Release JSON should still be saved even when assets are skipped.""" - args = self._create_mock_args(skip_assets_on=["big-repo"]) + args = create_args(skip_assets_on=["big-repo"]) repository = self._create_mock_repository(name="big-repo") release = self._create_mock_release() @@ -287,10 +241,10 @@ def test_release_metadata_still_saved_when_assets_skipped( @patch("github_backup.github_backup.mkdir_p") @patch("github_backup.github_backup.json_dump_if_changed") def test_non_matching_repo_still_downloads_assets( - self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download + self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download, create_args ): """Repos not in skip list should still download assets.""" - args = self._create_mock_args(skip_assets_on=["other-repo"]) + args = create_args(skip_assets_on=["other-repo"]) repository = self._create_mock_repository(name="normal-repo") release = self._create_mock_release() asset = self._create_mock_asset() diff --git a/tests/test_starred_skip_size_over.py b/tests/test_starred_skip_size_over.py index 2deb72a..250d191 100644 --- a/tests/test_starred_skip_size_over.py +++ b/tests/test_starred_skip_size_over.py @@ -1,39 +1,11 @@ """Tests for --starred-skip-size-over flag behavior (issue #108).""" import pytest -from unittest.mock import Mock from github_backup import github_backup -class TestStarredSkipSizeOver: - """Test suite for --starred-skip-size-over flag. - - Issue #108: Allow restricting size of starred repositories before cloning. - The size is based on the GitHub API's 'size' field (in KB), but the CLI - argument accepts MB for user convenience. - """ - - def _create_mock_args(self, **overrides): - """Create a mock args object with sensible defaults.""" - args = Mock() - args.user = "testuser" - args.repository = None - args.name_regex = None - args.languages = None - args.fork = False - args.private = False - args.skip_archived = False - args.starred_skip_size_over = None - args.exclude = None - - for key, value in overrides.items(): - setattr(args, key, value) - - return args - - -class TestStarredSkipSizeOverArgumentParsing(TestStarredSkipSizeOver): +class TestStarredSkipSizeOverArgumentParsing: """Tests for --starred-skip-size-over argument parsing.""" def test_starred_skip_size_over_not_set_defaults_to_none(self): @@ -52,12 +24,17 @@ def test_starred_skip_size_over_rejects_non_integer(self): github_backup.parse_args(["testuser", "--starred-skip-size-over", "abc"]) -class TestStarredSkipSizeOverFiltering(TestStarredSkipSizeOver): - """Tests for --starred-skip-size-over filtering behavior.""" +class TestStarredSkipSizeOverFiltering: + """Tests for --starred-skip-size-over filtering behavior. + + Issue #108: Allow restricting size of starred repositories before cloning. + The size is based on the GitHub API's 'size' field (in KB), but the CLI + argument accepts MB for user convenience. + """ - def test_starred_repo_under_limit_is_kept(self): + def test_starred_repo_under_limit_is_kept(self, create_args): """Starred repos under the size limit should be kept.""" - args = self._create_mock_args(starred_skip_size_over=500) + args = create_args(starred_skip_size_over=500) repos = [ { @@ -72,9 +49,9 @@ def test_starred_repo_under_limit_is_kept(self): assert len(result) == 1 assert result[0]["name"] == "small-repo" - def test_starred_repo_over_limit_is_filtered(self): + def test_starred_repo_over_limit_is_filtered(self, create_args): """Starred repos over the size limit should be filtered out.""" - args = self._create_mock_args(starred_skip_size_over=500) + args = create_args(starred_skip_size_over=500) repos = [ { @@ -88,9 +65,9 @@ def test_starred_repo_over_limit_is_filtered(self): result = github_backup.filter_repositories(args, repos) assert len(result) == 0 - def test_own_repo_over_limit_is_kept(self): + def test_own_repo_over_limit_is_kept(self, create_args): """User's own repos should not be affected by the size limit.""" - args = self._create_mock_args(starred_skip_size_over=500) + args = create_args(starred_skip_size_over=500) repos = [ { @@ -105,9 +82,9 @@ def test_own_repo_over_limit_is_kept(self): assert len(result) == 1 assert result[0]["name"] == "my-huge-repo" - def test_starred_repo_at_exact_limit_is_kept(self): + def test_starred_repo_at_exact_limit_is_kept(self, create_args): """Starred repos at exactly the size limit should be kept.""" - args = self._create_mock_args(starred_skip_size_over=500) + args = create_args(starred_skip_size_over=500) repos = [ { @@ -122,9 +99,9 @@ def test_starred_repo_at_exact_limit_is_kept(self): assert len(result) == 1 assert result[0]["name"] == "exact-limit-repo" - def test_mixed_repos_filtered_correctly(self): + def test_mixed_repos_filtered_correctly(self, create_args): """Mix of own and starred repos should be filtered correctly.""" - args = self._create_mock_args(starred_skip_size_over=500) + args = create_args(starred_skip_size_over=500) repos = [ { @@ -153,9 +130,9 @@ def test_mixed_repos_filtered_correctly(self): assert "starred-small" in names assert "starred-huge" not in names - def test_no_size_limit_keeps_all_starred(self): + def test_no_size_limit_keeps_all_starred(self, create_args): """When no size limit is set, all starred repos should be kept.""" - args = self._create_mock_args(starred_skip_size_over=None) + args = create_args(starred_skip_size_over=None) repos = [ { @@ -169,9 +146,9 @@ def test_no_size_limit_keeps_all_starred(self): result = github_backup.filter_repositories(args, repos) assert len(result) == 1 - def test_repo_without_size_field_is_kept(self): + def test_repo_without_size_field_is_kept(self, create_args): """Repos without a size field should be kept (size defaults to 0).""" - args = self._create_mock_args(starred_skip_size_over=500) + args = create_args(starred_skip_size_over=500) repos = [ { @@ -185,9 +162,9 @@ def test_repo_without_size_field_is_kept(self): result = github_backup.filter_repositories(args, repos) assert len(result) == 1 - def test_zero_value_warns_and_is_ignored(self, caplog): + def test_zero_value_warns_and_is_ignored(self, create_args, caplog): """Zero value should warn and keep all repos.""" - args = self._create_mock_args(starred_skip_size_over=0) + args = create_args(starred_skip_size_over=0) repos = [ { @@ -202,9 +179,9 @@ def test_zero_value_warns_and_is_ignored(self, caplog): assert len(result) == 1 assert "must be greater than 0" in caplog.text - def test_negative_value_warns_and_is_ignored(self, caplog): + def test_negative_value_warns_and_is_ignored(self, create_args, caplog): """Negative value should warn and keep all repos.""" - args = self._create_mock_args(starred_skip_size_over=-5) + args = create_args(starred_skip_size_over=-5) repos = [ {