[{"absolute_url": "https://datasets.softwareheritage.org/datasets/2026-03-02-compressed/", "anchor_url": "/graphs/compressed/#2026-03-02-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": null, "date": "2026-03-02", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2026-03-02/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2026-03-02-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2026-03-02/compressed/ 2026-03-02-compressed", "s3_url": "s3://softwareheritage/graph/2026-03-02/compressed/", "size": "13 TiB", "slug": "2026-03-02", "swh_download_command": "swh datasets download-graph 2026-03-02", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "Full compressed graph", "type": "compressed", "url_safe_id": "2026-03-02-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2026-03-02-orc/", "anchor_url": "/graphs/columnar/#2026-03-02-orc", "annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2026-03-02", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such 
as the Hadoop environment.", "export_url": "/exports/2026-03-02/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2026-03-02-orc", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2026-03-02/orc/ 2026-03-02-orc", "s3_url": "s3://softwareheritage/graph/2026-03-02/orc/", "size": "32 TiB", "slug": "2026-03-02", "swh_download_command": "swh datasets download-export 2026-03-02", "tables": null, "tags": ["orc"], "teaser_of": null, "title": "Graph export in columnar tables", "type": "orc", "url_safe_id": "2026-03-02-orc", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2026-03-02-history-hosting-compressed/", "anchor_url": "/graphs/compressed/#2026-03-02-history-hosting-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": "This is a compressed graph of only the \"history and hosting\" layer (origins, snapshots, releases, revisions) and the root directory (or rarely content) of every revision/release; but most directories and contents are excluded. Properties on the root directories (or content) other than its SWHID are omitted as well.\n", "date": "2026-03-02", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. 
The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2026-03-02/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2026-03-02-history-hosting-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2026-03-02-history-hosting/compressed/ 2026-03-02-history-hosting-compressed", "s3_url": "s3://softwareheritage/graph/2026-03-02-history-hosting/compressed/", "size": "1 TiB", "slug": "2026-03-02-history-hosting", "swh_download_command": "swh datasets download-graph 2026-03-02-history-hosting", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "History and hosting Compressed graph", "type": "compressed", "url_safe_id": "2026-03-02-history-hosting-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-10-08-compressed/", "anchor_url": "/graphs/compressed/#2025-10-08-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": null, "date": "2025-10-08", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. 
The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2025-10-08/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2025-10-08-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2025-10-08/compressed/ 2025-10-08-compressed", "s3_url": "s3://softwareheritage/graph/2025-10-08/compressed/", "size": "15 TiB", "slug": "2025-10-08", "swh_download_command": "swh datasets download-graph 2025-10-08", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "Full compressed graph", "type": "compressed", "url_safe_id": "2025-10-08-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-10-08-orc/", "anchor_url": "/graphs/columnar/#2025-10-08-orc", "annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2025-10-08", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2025-10-08/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2025-10-08-orc", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2025-10-08/orc/ 2025-10-08-orc", "s3_url": "s3://softwareheritage/graph/2025-10-08/orc/", "size": "30 TiB", "slug": "2025-10-08", "swh_download_command": "swh datasets download-export 2025-10-08", "tables": null, "tags": ["orc"], "teaser_of": null, "title": "Graph export in columnar tables", "type": "orc", "url_safe_id": "2025-10-08-orc", "variant": 
null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-10-08-contents/", "anchor_url": "/derived/contents/#2025-10-08-contents", "annex_url": null, "category": "contents", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2025-10-08", "deprecated": false, "derived_of": "2025-10-08-compressed", "description": "Precomputed information on the graph's content nodes: most popular name, id and date of the oldest revision to contain them, and a sample origin", "export_url": "/exports/2025-10-08/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2025-10-08-contents", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2025-10-08/contents/ 2025-10-08-contents", "s3_url": "s3://softwareheritage/derived_datasets/2025-10-08/contents/", "size": null, "slug": "2025-10-08", "swh_download_command": null, "tables": null, "tags": ["contents", "derived"], "teaser_of": null, "title": "Aggregated Contents", "type": "contents", "url_safe_id": "2025-10-08-contents", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-10-08-history-hosting-compressed/", "anchor_url": "/graphs/compressed/#2025-10-08-history-hosting-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": "This is a compressed graph of only the \"history and hosting\" layer (origins, snapshots, releases, revisions) and the root directory (or rarely content) of every revision/release; but most directories and contents are excluded. 
Properties on the root directories (or content) other than its SWHID are omitted as well.\n", "date": "2025-10-08", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2025-10-08/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2025-10-08-history-hosting-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2025-10-08-history-hosting/compressed/ 2025-10-08-history-hosting-compressed", "s3_url": "s3://softwareheritage/graph/2025-10-08-history-hosting/compressed/", "size": "841 GiB", "slug": "2025-10-08-history-hosting", "swh_download_command": "swh datasets download-graph 2025-10-08-history-hosting", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "History and hosting Compressed graph", "type": "compressed", "url_safe_id": "2025-10-08-history-hosting-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-10-08-topology/", "anchor_url": "/derived/topology/#2025-10-08-topology", "annex_url": null, "category": "topology", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2025-10-08", "deprecated": false, "derived_of": "2025-10-08-compressed", "description": "Precomputed topological order (in two formats: CSV and bitstream) and generation numbers (as a plain array). 
The <a href=\"https://docs.rs/swh_graph_topology/\">swh_graph_topology crate</a> is needed to read the topological order bitstream, but is significantly faster than CSV and allows some parallel processing.", "export_url": "/exports/2025-10-08/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2025-10-08-topology", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2025-10-08/topology/ 2025-10-08-topology", "s3_url": "s3://softwareheritage/derived_datasets/2025-10-08/topology/", "size": null, "slug": "2025-10-08", "swh_download_command": null, "tables": null, "tags": ["topology", "derived"], "teaser_of": null, "title": "Topology", "type": "topology", "url_safe_id": "2025-10-08-topology", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-05-18-compressed/", "anchor_url": "/graphs/compressed/#2025-05-18-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": null, "date": "2025-05-18", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. 
The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2025-05-18/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2025-05-18-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2025-05-18/compressed/ 2025-05-18-compressed", "s3_url": "s3://softwareheritage/graph/2025-05-18/compressed/", "size": "14 TiB", "slug": "2025-05-18", "swh_download_command": "swh datasets download-graph 2025-05-18", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "Full compressed graph", "type": "compressed", "url_safe_id": "2025-05-18-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-05-18-orc/", "anchor_url": "/graphs/columnar/#2025-05-18-orc", "annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2025-05-18", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2025-05-18/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2025-05-18-orc", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2025-05-18/orc/ 2025-05-18-orc", "s3_url": "s3://softwareheritage/graph/2025-05-18/orc/", "size": "27 TiB", "slug": "2025-05-18", "swh_download_command": "swh datasets download-export 2025-05-18", "tables": null, "tags": ["orc"], "teaser_of": null, "title": "Graph export in columnar tables", "type": "orc", "url_safe_id": "2025-05-18-orc", "variant": 
null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-05-18-contents/", "anchor_url": "/derived/contents/#2025-05-18-contents", "annex_url": null, "category": "contents", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2025-05-18", "deprecated": false, "derived_of": "2025-05-18-compressed", "description": "Precomputed information on the graph's content nodes: most popular name, id and date of the oldest revision to contain them, and a sample origin", "export_url": "/exports/2025-05-18/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2025-05-18-contents", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2025-05-18/contents/ 2025-05-18-contents", "s3_url": "s3://softwareheritage/derived_datasets/2025-05-18/contents/", "size": null, "slug": "2025-05-18", "swh_download_command": null, "tables": null, "tags": ["contents", "derived"], "teaser_of": null, "title": "Aggregated Contents", "type": "contents", "url_safe_id": "2025-05-18-contents", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-05-18-history-hosting-compressed/", "anchor_url": "/graphs/compressed/#2025-05-18-history-hosting-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": "This is a compressed graph of only the \"history and hosting\" layer (origins, snapshots, releases, revisions) and the root directory (or rarely content) of every revision/release; but most directories and contents are excluded. 
Properties on the root directories (or content) other than its SWHID are omitted as well.\n", "date": "2025-05-18", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2025-05-18/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2025-05-18-history-hosting-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2025-05-18-history-hosting/compressed/ 2025-05-18-history-hosting-compressed", "s3_url": "s3://softwareheritage/graph/2025-05-18-history-hosting/compressed/", "size": "782 GiB", "slug": "2025-05-18-history-hosting", "swh_download_command": "swh datasets download-graph 2025-05-18-history-hosting", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "History and hosting Compressed graph", "type": "compressed", "url_safe_id": "2025-05-18-history-hosting-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-05-18-topology/", "anchor_url": "/derived/topology/#2025-05-18-topology", "annex_url": null, "category": "topology", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2025-05-18", "deprecated": false, "derived_of": "2025-05-18-compressed", "description": "Precomputed topological order (in two formats: CSV and bitstream) and generation numbers (as a plain array). 
The <a href=\"https://docs.rs/swh_graph_topology/\">swh_graph_topology crate</a> is needed to read the topological order bitstream, but is significantly faster than CSV and allows some parallel processing.", "export_url": "/exports/2025-05-18/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2025-05-18-topology", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2025-05-18/topology/ 2025-05-18-topology", "s3_url": "s3://softwareheritage/derived_datasets/2025-05-18/topology/", "size": null, "slug": "2025-05-18", "swh_download_command": null, "tables": null, "tags": ["topology", "derived"], "teaser_of": null, "title": "Topology", "type": "topology", "url_safe_id": "2025-05-18-topology", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-05-18-popular-1k-compressed/", "anchor_url": "/teasers/compressed/#2025-05-18-popular-1k-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": "This is a subgraph of the 2025-05-18 export, filtered by rooting from 1000 popular origins:\n\n- 900 among the most starred GitHub repositories (as of July 1st 2025)\n- 100 among the most frequently installed Debian packages (according to the\n  [Debian Popularity Contest](https://popcon.debian.org/) database published on Sept 3rd 2025).\n", "date": "2025-05-18", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. 
The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2025-05-18/", "group": "teasers", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2025-05-18-popular-1k-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2025-05-18-popular-1k/compressed/ 2025-05-18-popular-1k-compressed", "s3_url": "s3://softwareheritage/graph/2025-05-18-popular-1k/compressed/", "size": "202 GB", "slug": "2025-05-18-popular-1k", "swh_download_command": "swh datasets download-graph 2025-05-18-popular-1k", "tables": null, "tags": ["compressed", "teaser"], "teaser_of": "2025-05-18-compressed", "title": "Popular 1k compressed graph", "type": "compressed", "url_safe_id": "2025-05-18-popular-1k-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-05-18-popular-1k-orc/", "anchor_url": "/teasers/columnar/#2025-05-18-popular-1k-orc", "annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": "This is a subgraph of the 2025-05-18 export, filtered by rooting from 1000 popular origins:\n\n- 900 among the most starred GitHub repositories (as of July 1st 2025)\n- 100 among the most frequently installed Debian packages (according to the\n  [Debian Popularity Contest](https://popcon.debian.org/) database published on Sept 3rd 2025).\n", "date": "2025-05-18", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2025-05-18/", "group": "teasers", "hosted_on_annex": false, "hosted_on_s3": true, "id": 
"2025-05-18-popular-1k-orc", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2025-05-18-popular-1k/orc/ 2025-05-18-popular-1k-orc", "s3_url": "s3://softwareheritage/graph/2025-05-18-popular-1k/orc/", "size": "202 GB", "slug": "2025-05-18-popular-1k", "swh_download_command": "swh datasets download-export 2025-05-18-popular-1k", "tables": null, "tags": ["orc", "teaser"], "teaser_of": "2025-05-18-orc", "title": "Popular 1k columnar tables", "type": "orc", "url_safe_id": "2025-05-18-popular-1k-orc", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-05-18-digestmap/", "anchor_url": "/derived/digestmap/#2025-05-18-digestmap", "annex_url": null, "category": "digestmap", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2025-05-18", "deprecated": false, "derived_of": "2025-05-18-compressed", "description": "Efficient mapping of content hashes (from SWHID to SHA1).", "export_url": "/exports/2025-05-18/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2025-05-18-digestmap", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2025-05-18/digestmap/ 2025-05-18-digestmap", "s3_url": "s3://softwareheritage/derived_datasets/2025-05-18/digestmap/", "size": "1.1 TiB", "slug": "2025-05-18", "swh_download_command": null, "tables": null, "tags": ["digestmap", "derived"], "teaser_of": null, "title": "Digestmap", "type": "digestmap", "url_safe_id": "2025-05-18-digestmap", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-05-18-popular-1k-digestmap/", "anchor_url": "/derived/digestmap/#2025-05-18-popular-1k-digestmap", "annex_url": null, "category": "digestmap", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", 
"https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2025-05-18", "deprecated": false, "derived_of": "2025-05-18-popular-1k-compressed", "description": "Efficient mapping of content hashes (from SWHID to SHA1).", "export_url": "/exports/2025-05-18/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2025-05-18-popular-1k-digestmap", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2025-05-18-popular-1k/digestmap/ 2025-05-18-popular-1k-digestmap", "s3_url": "s3://softwareheritage/derived_datasets/2025-05-18-popular-1k/digestmap/", "size": "4.8 GiB", "slug": "2025-05-18-popular-1k", "swh_download_command": null, "tables": null, "tags": ["digestmap", "derived"], "teaser_of": null, "title": "Digestmap", "type": "digestmap", "url_safe_id": "2025-05-18-popular-1k-digestmap", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2025-05-18-license/", "anchor_url": "/derived/license/#2025-05-18-license", "annex_url": "https://annex.softwareheritage.org/public/dataset/license-blobs/2025-05-18/", "category": "license", "citations": ["https://datasets.softwareheritage.org/Zacchiroli22.bib", "https://datasets.softwareheritage.org/GonzalezBarahonaLRZ23.bib"], "comments": null, "date": "2025-05-18", "deprecated": false, "derived_of": "2025-05-18-compressed", "description": "Dataset of the complete texts of free/open source software (FOSS) license variants.", "export_url": "/exports/2025-05-18/", "group": "derived", "hosted_on_annex": true, "hosted_on_s3": false, "id": "2025-05-18-license", "license": "CC-BY-4.0", "s3_download_command": null, "s3_url": null, "size": "30 GiB", "slug": "2025-05-18", "swh_download_command": null, "tables": null, "tags": ["license", "derived"], "teaser_of": null, "title": "License blobs", "type": "license", "url_safe_id": "2025-05-18-license", "variant": null}, {"absolute_url": 
"https://datasets.softwareheritage.org/datasets/2024-12-06-compressed/", "anchor_url": "/graphs/compressed/#2024-12-06-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": null, "date": "2024-12-06", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2024-12-06/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-12-06-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2024-12-06/compressed/ 2024-12-06-compressed", "s3_url": "s3://softwareheritage/graph/2024-12-06/compressed/", "size": "12 TiB", "slug": "2024-12-06", "swh_download_command": "swh datasets download-graph 2024-12-06", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "Full compressed graph", "type": "compressed", "url_safe_id": "2024-12-06-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2024-12-06-orc/", "anchor_url": "/graphs/columnar/#2024-12-06-orc", "annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2024-12-06", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop 
environment.", "export_url": "/exports/2024-12-06/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-12-06-orc", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2024-12-06/orc/ 2024-12-06-orc", "s3_url": "s3://softwareheritage/graph/2024-12-06/orc/", "size": "23 TiB", "slug": "2024-12-06", "swh_download_command": "swh datasets download-export 2024-12-06", "tables": null, "tags": ["orc"], "teaser_of": null, "title": "Graph export in columnar tables", "type": "orc", "url_safe_id": "2024-12-06-orc", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2024-12-06-contents/", "anchor_url": "/derived/contents/#2024-12-06-contents", "annex_url": null, "category": "contents", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2024-12-06", "deprecated": false, "derived_of": "2024-12-06-compressed", "description": "Precomputed information on the graph's content nodes: most popular name, id and date of the oldest revision to contain them, and a sample origin", "export_url": "/exports/2024-12-06/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-12-06-contents", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2024-12-06/contents/ 2024-12-06-contents", "s3_url": "s3://softwareheritage/derived_datasets/2024-12-06/contents/", "size": null, "slug": "2024-12-06", "swh_download_command": null, "tables": null, "tags": ["contents", "derived"], "teaser_of": null, "title": "Aggregated Contents", "type": "contents", "url_safe_id": "2024-12-06-contents", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2024-12-06-provenance_all/", "anchor_url": "/derived/provenance/#2024-12-06-provenance/all", "annex_url": null, 
"category": "provenance", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2024-12-06", "deprecated": false, "derived_of": "2024-12-06-compressed", "description": "Precomputed tables of all revisions a content is in, and all origins a revision is in, indexed for reasonably efficient backward queries. Its current implementation is primarily a Parquet database, along with some external indexes for more efficient access. The swh-provenance Rust crate provides access to these indexes and a gRPC server to query the data remotely.", "export_url": "/exports/2024-12-06/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-12-06-provenance/all", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2024-12-06/provenance/all/ 2024-12-06-provenance_all", "s3_url": "s3://softwareheritage/derived_datasets/2024-12-06/provenance/all/", "size": null, "slug": "2024-12-06", "swh_download_command": null, "tables": null, "tags": ["provenance/all", "derived"], "teaser_of": null, "title": "Provenance (all releases and revisions)", "type": "provenance/all", "url_safe_id": "2024-12-06-provenance_all", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2024-12-06-provenance_heads/", "anchor_url": "/derived/provenance/#2024-12-06-provenance/heads", "annex_url": null, "category": "provenance", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2024-12-06", "deprecated": false, "derived_of": "2024-12-06-compressed", "description": "Precomputed tables of all revisions a content is in, and all origins a revision is in, indexed for reasonably efficient backward queries. 
Its current implementation is primarily a Parquet database, along with some external indexes for more efficient access. The swh-provenance Rust crate provides access to these indexes and a gRPC server to query the data remotely. All revisions that are not directly pointed to by a snapshot or release (i.e. not branch heads or tagged revisions) are omitted.", "export_url": "/exports/2024-12-06/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-12-06-provenance/heads", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2024-12-06/provenance/heads/ 2024-12-06-provenance_heads", "s3_url": "s3://softwareheritage/derived_datasets/2024-12-06/provenance/heads/", "size": null, "slug": "2024-12-06", "swh_download_command": null, "tables": null, "tags": ["provenance/heads", "derived"], "teaser_of": null, "title": "Provenance (releases and head revisions)", "type": "provenance/heads", "url_safe_id": "2024-12-06-provenance_heads", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2024-12-06-topology/", "anchor_url": "/derived/topology/#2024-12-06-topology", "annex_url": null, "category": "topology", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2024-12-06", "deprecated": false, "derived_of": "2024-12-06-compressed", "description": "Precomputed topological order (in two formats: CSV and bitstream) and generation numbers (as a plain array). 
The <a href=\"https://docs.rs/swh_graph_topology/\">swh_graph_topology crate</a> is needed to read the topological order bitstream, but is significantly faster than CSV and allows some parallel processing.", "export_url": "/exports/2024-12-06/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-12-06-topology", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2024-12-06/topology/ 2024-12-06-topology", "s3_url": "s3://softwareheritage/derived_datasets/2024-12-06/topology/", "size": null, "slug": "2024-12-06", "swh_download_command": null, "tables": null, "tags": ["topology", "derived"], "teaser_of": null, "title": "Topology", "type": "topology", "url_safe_id": "2024-12-06-topology", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2024-12-06-digestmap/", "anchor_url": "/derived/digestmap/#2024-12-06-digestmap", "annex_url": null, "category": "digestmap", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2024-12-06", "deprecated": false, "derived_of": "2024-12-06-compressed", "description": "Efficient mapping of content hashes (from SWHID to SHA1).", "export_url": "/exports/2024-12-06/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-12-06-digestmap", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2024-12-06/digestmap/ 2024-12-06-digestmap", "s3_url": "s3://softwareheritage/derived_datasets/2024-12-06/digestmap/", "size": "917 GiB", "slug": "2024-12-06", "swh_download_command": null, "tables": null, "tags": ["digestmap", "derived"], "teaser_of": null, "title": "Digestmap", "type": "digestmap", "url_safe_id": "2024-12-06-digestmap", "variant": null}, {"absolute_url": 
"https://datasets.softwareheritage.org/datasets/2024-12-06-license/", "anchor_url": "/derived/license/#2024-12-06-license", "annex_url": "https://annex.softwareheritage.org/public/dataset/license-blobs/2024-12-06/", "category": "license", "citations": ["https://datasets.softwareheritage.org/Zacchiroli22.bib", "https://datasets.softwareheritage.org/GonzalezBarahonaLRZ23.bib"], "comments": null, "date": "2024-12-06", "deprecated": false, "derived_of": "2024-12-06-compressed", "description": "Dataset of the complete texts of free/open source software (FOSS) license variants.", "export_url": "/exports/2024-12-06/", "group": "derived", "hosted_on_annex": true, "hosted_on_s3": false, "id": "2024-12-06-license", "license": "CC-BY-4.0", "s3_download_command": null, "s3_url": null, "size": "27.6 GiB", "slug": "2024-12-06", "swh_download_command": null, "tables": null, "tags": ["license", "derived"], "teaser_of": null, "title": "License blobs", "type": "license", "url_safe_id": "2024-12-06-license", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2024-08-23-compressed/", "anchor_url": "/graphs/compressed/#2024-08-23-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": "This graph changed the MPH from GOV/Cmph to PTHash; Rust code hardcoding GOVMPH needs to replace it with DynMph or SwhidPthash. Java is no longer supported to read this graph.\n", "date": "2024-08-23", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. 
The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2024-08-23/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-08-23-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2024-08-23/compressed/ 2024-08-23-compressed", "s3_url": "s3://softwareheritage/graph/2024-08-23/compressed/", "size": "11 TiB", "slug": "2024-08-23", "swh_download_command": "swh datasets download-graph 2024-08-23", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "Full compressed graph", "type": "compressed", "url_safe_id": "2024-08-23-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2024-08-23-orc/", "anchor_url": "/graphs/columnar/#2024-08-23-orc", "annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2024-08-23", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2024-08-23/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-08-23-orc", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2024-08-23/orc/ 2024-08-23-orc", "s3_url": "s3://softwareheritage/graph/2024-08-23/orc/", "size": "19 TiB", "slug": "2024-08-23", "swh_download_command": "swh datasets download-export 2024-08-23", "tables": null, "tags": ["orc"], "teaser_of": null, "title": "Graph export in columnar tables", "type": "orc", "url_safe_id": "2024-08-23-orc", "variant": 
null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2024-08-23-popular-500-python-compressed/", "anchor_url": "/teasers/compressed/#2024-08-23-popular-500-python-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": "This teaser contains a subset of the 443 repositories archived by Software Heritage as of 2024-08-23, among the 700 GitHub repositories tagged as being written in Python with the most stars.\n", "date": "2024-08-23", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2024-08-23/", "group": "teasers", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-08-23-popular-500-python-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2024-08-23-popular-500-python/compressed/ 2024-08-23-popular-500-python-compressed", "s3_url": "s3://softwareheritage/graph/2024-08-23-popular-500-python/compressed/", "size": "15 GB", "slug": "2024-08-23-popular-500-python", "swh_download_command": "swh datasets download-graph 2024-08-23-popular-500-python", "tables": null, "tags": ["compressed", "teaser"], "teaser_of": "2024-08-23-compressed", "title": "Popular 500 python compressed graph", "type": "compressed", "url_safe_id": "2024-08-23-popular-500-python-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2024-08-23-popular-500-python-orc/", "anchor_url": "/teasers/columnar/#2024-08-23-popular-500-python-orc", 
"annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": "This teaser contains a subset of the 443 repositories archived by Software Heritage as of 2024-08-23, among the 700 GitHub repositories tagged as being written in Python with the most stars.\n", "date": "2024-08-23", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2024-08-23/", "group": "teasers", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-08-23-popular-500-python-orc", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2024-08-23-popular-500-python/orc/ 2024-08-23-popular-500-python-orc", "s3_url": "s3://softwareheritage/graph/2024-08-23-popular-500-python/orc/", "size": "36 GB", "slug": "2024-08-23-popular-500-python", "swh_download_command": "swh datasets download-export 2024-08-23-popular-500-python", "tables": null, "tags": ["orc", "teaser"], "teaser_of": "2024-08-23-orc", "title": "Popular 500 python columnar tables", "type": "orc", "url_safe_id": "2024-08-23-popular-500-python-orc", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2024-08-23-popular-500-python-provenance_all/", "anchor_url": "/teasers/provenance/#2024-08-23-popular-500-python-provenance/all", "annex_url": null, "category": "provenance", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": "This teaser contains a subset of the 443 repositories archived by Software Heritage as of 2024-08-23, among the 700 GitHub repositories tagged as being written in Python with the most 
stars.\n", "date": "2024-08-23", "deprecated": false, "derived_of": null, "description": "Precomputed tables of all revisions a content is in, and all origins a revision is in, indexed for reasonably efficient backward queries. Its current implementation is primarily a Parquet database, along with some external indexes for more efficient access. The swh-provenance Rust crate provides access to these indexes and a gRPC server to query the data remotely.", "export_url": "/exports/2024-08-23/", "group": "teasers", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-08-23-popular-500-python-provenance/all", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2024-08-23-popular-500-python/provenance/all/ 2024-08-23-popular-500-python-provenance_all", "s3_url": "s3://softwareheritage/graph/2024-08-23-popular-500-python/provenance/all/", "size": "46 GB", "slug": "2024-08-23-popular-500-python", "swh_download_command": null, "tables": null, "tags": ["provenance/all", "teaser"], "teaser_of": "2024-08-23-compressed", "title": "Popular 500 python provenance (all releases and revisions)", "type": "provenance/all", "url_safe_id": "2024-08-23-popular-500-python-provenance_all", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2024-05-16-compressed/", "anchor_url": "/graphs/compressed/#2024-05-16-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": null, "date": "2024-05-16", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. 
The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2024-05-16/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-05-16-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2024-05-16/compressed/ 2024-05-16-compressed", "s3_url": "s3://softwareheritage/graph/2024-05-16/compressed/", "size": "11 TiB", "slug": "2024-05-16", "swh_download_command": "swh datasets download-graph 2024-05-16", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "Full compressed graph", "type": "compressed", "url_safe_id": "2024-05-16-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2024-05-16-orc/", "anchor_url": "/graphs/columnar/#2024-05-16-orc", "annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2024-05-16", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2024-05-16/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-05-16-orc", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2024-05-16/orc/ 2024-05-16-orc", "s3_url": "s3://softwareheritage/graph/2024-05-16/orc/", "size": "18 TiB", "slug": "2024-05-16", "swh_download_command": "swh datasets download-export 2024-05-16", "tables": null, "tags": ["orc"], "teaser_of": null, "title": "Graph export in columnar tables", "type": "orc", "url_safe_id": "2024-05-16-orc", "variant": 
null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2024-05-16-path_counts/", "anchor_url": "/derived/path_counts/#2024-05-16-path_counts", "annex_url": null, "category": "path_counts", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2024-05-16", "deprecated": false, "derived_of": "2024-05-16-compressed", "description": "Associates each node to the number of paths from any node to that node.", "export_url": "/exports/2024-05-16/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2024-05-16-path_counts", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2024-05-16/path_counts_forward_ori,snp,rel,rev,dir,cnt/ 2024-05-16-path_counts", "s3_url": "s3://softwareheritage/derived_datasets/2024-05-16/path_counts_forward_ori,snp,rel,rev,dir,cnt/", "size": null, "slug": "2024-05-16", "swh_download_command": null, "tables": null, "tags": ["path_counts", "derived"], "teaser_of": null, "title": "Path counts", "type": "path_counts", "url_safe_id": "2024-05-16-path_counts", "variant": "forward_ori,snp,rel,rev,dir,cnt"}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2023-09-06-compressed/", "anchor_url": "/graphs/compressed/#2023-09-06-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": null, "date": "2023-09-06", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. 
The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2023-09-06/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2023-09-06-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2023-09-06/compressed/ 2023-09-06-compressed", "s3_url": "s3://softwareheritage/graph/2023-09-06/compressed/", "size": "11 TiB", "slug": "2023-09-06", "swh_download_command": "swh datasets download-graph 2023-09-06", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "Full compressed graph", "type": "compressed", "url_safe_id": "2023-09-06-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2023-09-06-orc/", "anchor_url": "/graphs/columnar/#2023-09-06-orc", "annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2023-09-06", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2023-09-06/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2023-09-06-orc", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2023-09-06/orc/ 2023-09-06-orc", "s3_url": "s3://softwareheritage/graph/2023-09-06/orc/", "size": "18 TiB", "slug": "2023-09-06", "swh_download_command": "swh datasets download-export 2023-09-06", "tables": null, "tags": ["orc"], "teaser_of": null, "title": "Graph export in columnar tables", "type": "orc", "url_safe_id": "2023-09-06-orc", "variant": 
null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2023-09-06-popular-1k-compressed/", "anchor_url": "/teasers/compressed/#2023-09-06-popular-1k-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": "The popular-1k teaser contains a subset of 1120 popular repositories **tagged as being written in one of the 10 most popular languages** (Javascript, Python, Java, Typescript, C#, C++, PHP, Shell, C, Ruby), from GitHub, GitLab.com, Packagist, PyPI and Debian. The selection criteria to pick the software origins for each language was the following:\n\n- the 50 most popular Gitlab.com projects written in that language that have 2 stars or more,\n- for Python, the 50 most popular PyPI projects (by usage statistics, according to the Top PyPI Packages database),\n- for PHP, the 50 most popular Packagist projects (by usage statistics, according to Packagist's API),\n- the 50 most popular Debian packages with the relevant implemented-in:: debtag (by \"installs\" according to the Debian Popularity Contest database).\n- most popular GitHub projects written in Python (by number of stars), until the total number of origins for that language reaches 200\n- removing origins not archived by Software Heritage by 2023-09-06\n", "date": "2023-09-06", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. 
The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2023-09-06/", "group": "teasers", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2023-09-06-popular-1k-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2023-09-06-popular-1k/compressed/ 2023-09-06-popular-1k-compressed", "s3_url": "s3://softwareheritage/graph/2023-09-06-popular-1k/compressed/", "size": "42 GB", "slug": "2023-09-06-popular-1k", "swh_download_command": "swh datasets download-graph 2023-09-06-popular-1k", "tables": null, "tags": ["compressed", "teaser"], "teaser_of": "2023-09-06-compressed", "title": "Popular 1k compressed graph", "type": "compressed", "url_safe_id": "2023-09-06-popular-1k-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2023-09-06-popular-1k-orc/", "anchor_url": "/teasers/columnar/#2023-09-06-popular-1k-orc", "annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": "The popular-1k teaser contains a subset of 1120 popular repositories **tagged as being written in one of the 10 most popular languages** (Javascript, Python, Java, Typescript, C#, C++, PHP, Shell, C, Ruby), from GitHub, GitLab.com, Packagist, PyPI and Debian. 
The selection criteria to pick the software origins for each language was the following:\n\n- the 50 most popular Gitlab.com projects written in that language that have 2 stars or more,\n- for Python, the 50 most popular PyPI projects (by usage statistics, according to the Top PyPI Packages database),\n- for PHP, the 50 most popular Packagist projects (by usage statistics, according to Packagist's API),\n- the 50 most popular Debian packages with the relevant implemented-in:: debtag (by \"installs\" according to the Debian Popularity Contest database).\n- most popular GitHub projects written in Python (by number of stars), until the total number of origins for that language reaches 200\n- removing origins not archived by Software Heritage by 2023-09-06\n", "date": "2023-09-06", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2023-09-06/", "group": "teasers", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2023-09-06-popular-1k-orc", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2023-09-06-popular-1k/orc/ 2023-09-06-popular-1k-orc", "s3_url": "s3://softwareheritage/graph/2023-09-06-popular-1k/orc/", "size": "280 GB", "slug": "2023-09-06-popular-1k", "swh_download_command": "swh datasets download-export 2023-09-06-popular-1k", "tables": null, "tags": ["orc", "teaser"], "teaser_of": "2023-09-06-orc", "title": "Popular 1k columnar tables", "type": "orc", "url_safe_id": "2023-09-06-popular-1k-orc", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2023-09-06-popular-1k-digestmap/", "anchor_url": "/derived/digestmap/#2023-09-06-popular-1k-digestmap", "annex_url": null, "category": "digestmap", "citations": 
["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2023-09-06", "deprecated": false, "derived_of": "2023-09-06-popular-1k-compressed", "description": "Efficient mapping of content hashes (from SWHID to SHA1).", "export_url": "/exports/2023-09-06/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2023-09-06-popular-1k-digestmap", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2023-09-06-popular-1k/digestmap/ 2023-09-06-popular-1k-digestmap", "s3_url": "s3://softwareheritage/derived_datasets/2023-09-06-popular-1k/digestmap/", "size": "4.8 GiB", "slug": "2023-09-06-popular-1k", "swh_download_command": null, "tables": null, "tags": ["digestmap", "derived"], "teaser_of": null, "title": "Digestmap", "type": "digestmap", "url_safe_id": "2023-09-06-popular-1k-digestmap", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2022-12-07-compressed/", "anchor_url": "/graphs/compressed/#2022-12-07-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": "author and committer timestamps were shifted back 1 or 2 hours, based on the Europe/Paris timezone, see https://gitlab.softwareheritage.org/swh/devel/swh-graph/-/issues/4788\n", "date": "2022-12-07", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. 
The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2022-12-07/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2022-12-07-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2022-12-07/compressed/ 2022-12-07-compressed", "s3_url": "s3://softwareheritage/graph/2022-12-07/compressed/", "size": "7.1 TiB", "slug": "2022-12-07", "swh_download_command": "swh datasets download-graph 2022-12-07", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "Full compressed graph", "type": "compressed", "url_safe_id": "2022-12-07-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2022-12-07-orc/", "anchor_url": "/graphs/columnar/#2022-12-07-orc", "annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2022-12-07", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2022-12-07/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2022-12-07-orc", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2022-12-07/orc/ 2022-12-07-orc", "s3_url": "s3://softwareheritage/graph/2022-12-07/orc/", "size": "13 TiB", "slug": "2022-12-07", "swh_download_command": "swh datasets download-export 2022-12-07", "tables": null, "tags": ["orc"], "teaser_of": null, "title": "Graph export in columnar tables", "type": "orc", "url_safe_id": "2022-12-07-orc", "variant": 
null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2022-12-07-history-compressed/", "anchor_url": "/graphs/compressed/#2022-12-07-history-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": "This is a compressed graph of only the \"history and hosting\" layer (origins, snapshots, releases, revisions) and the root directory (or rarely content) of every revision/release; but most directories and contents are excluded\n", "date": "2022-12-07", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2022-12-07/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2022-12-07-history-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2022-12-07-history/compressed/ 2022-12-07-history-compressed", "s3_url": "s3://softwareheritage/graph/2022-12-07-history/compressed/", "size": "1 TiB", "slug": "2022-12-07-history", "swh_download_command": "swh datasets download-graph 2022-12-07-history", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "History and hosting Compressed graph", "type": "compressed", "url_safe_id": "2022-12-07-history-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2022-12-07-path_counts/", "anchor_url": "/derived/path_counts/#2022-12-07-path_counts", "annex_url": null, "category": "path_counts", "citations": 
["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2022-12-07", "deprecated": false, "derived_of": "2022-12-07-compressed", "description": "Associates each node to the number of paths from any node to that node.", "export_url": "/exports/2022-12-07/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2022-12-07-path_counts", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2022-12-07/path_counts_forward_ori,snp,rel,rev,dir,cnt/ 2022-12-07-path_counts", "s3_url": "s3://softwareheritage/derived_datasets/2022-12-07/path_counts_forward_ori,snp,rel,rev,dir,cnt/", "size": null, "slug": "2022-12-07", "swh_download_command": null, "tables": null, "tags": ["path_counts", "derived"], "teaser_of": null, "title": "Path counts", "type": "path_counts", "url_safe_id": "2022-12-07-path_counts", "variant": "forward_ori,snp,rel,rev,dir,cnt"}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2022-12-07-license/", "anchor_url": "/derived/license/#2022-12-07-license", "annex_url": "https://annex.softwareheritage.org/public/dataset/license-blobs/2022-12-07/", "category": "license", "citations": ["https://datasets.softwareheritage.org/Zacchiroli22.bib", "https://datasets.softwareheritage.org/GonzalezBarahonaLRZ23.bib"], "comments": null, "date": "2022-12-07", "deprecated": false, "derived_of": "2022-12-07-compressed", "description": "Dataset of the complete texts of free/open source software (FOSS) license variants.", "export_url": "/exports/2022-12-07/", "group": "derived", "hosted_on_annex": true, "hosted_on_s3": false, "id": "2022-12-07-license", "license": "CC-BY-4.0", "s3_download_command": null, "s3_url": null, "size": "16.4 GiB", "slug": "2022-12-07", "swh_download_command": null, "tables": null, "tags": ["license", "derived"], "teaser_of": null, "title": "License blobs", 
"type": "license", "url_safe_id": "2022-12-07-license", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2022-04-25-compressed/", "anchor_url": "/graphs/compressed/#2022-04-25-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": null, "date": "2022-04-25", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2022-04-25/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2022-04-25-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2022-04-25/compressed/ 2022-04-25-compressed", "s3_url": "s3://softwareheritage/graph/2022-04-25/compressed/", "size": "6.5 TiB", "slug": "2022-04-25", "swh_download_command": "swh datasets download-graph 2022-04-25", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "Full compressed graph", "type": "compressed", "url_safe_id": "2022-04-25-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2022-04-25-orc/", "anchor_url": "/graphs/columnar/#2022-04-25-orc", "annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2022-04-25", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for 
scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2022-04-25/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2022-04-25-orc", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2022-04-25/orc/ 2022-04-25-orc", "s3_url": "s3://softwareheritage/graph/2022-04-25/orc/", "size": "11 TiB", "slug": "2022-04-25", "swh_download_command": "swh datasets download-export 2022-04-25", "tables": null, "tags": ["orc"], "teaser_of": null, "title": "Graph export in columnar tables", "type": "orc", "url_safe_id": "2022-04-25-orc", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2022-04-25-popular_contents/", "anchor_url": "/derived/contents/#2022-04-25-popular_contents", "annex_url": null, "category": "contents", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": "A deprecated dataset listing the most popular name of each content. Replaced by the Aggregated Contents Dataset\n", "date": "2022-04-25", "deprecated": true, "derived_of": "2022-04-25-compressed", "description": "A deprecated dataset listing the most popular name of each content. 
Replaced by the Aggregated Contents Dataset.", "export_url": "/exports/2022-04-25/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2022-04-25-popular_contents", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2022-04-25/popular_contents/ 2022-04-25-popular_contents", "s3_url": "s3://softwareheritage/derived_datasets/2022-04-25/popular_contents/", "size": null, "slug": "2022-04-25", "swh_download_command": null, "tables": null, "tags": ["popular_contents", "derived"], "teaser_of": null, "title": "Popular Contents", "type": "popular_contents", "url_safe_id": "2022-04-25-popular_contents", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2022-04-25-path_counts/", "anchor_url": "/derived/path_counts/#2022-04-25-path_counts", "annex_url": null, "category": "path_counts", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2022-04-25", "deprecated": false, "derived_of": "2022-04-25-compressed", "description": "Associates each node to the number of paths from any node to that node.", "export_url": "/exports/2022-04-25/", "group": "derived", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2022-04-25-path_counts", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/derived_datasets/2022-04-25/path_counts_forward_ori,snp,rel,rev,dir,cnt/ 2022-04-25-path_counts", "s3_url": "s3://softwareheritage/derived_datasets/2022-04-25/path_counts_forward_ori,snp,rel,rev,dir,cnt/", "size": null, "slug": "2022-04-25", "swh_download_command": null, "tables": null, "tags": ["path_counts", "derived"], "teaser_of": null, "title": "Path counts", "type": "path_counts", "url_safe_id": "2022-04-25-path_counts", "variant": "forward_ori,snp,rel,rev,dir,cnt"}, {"absolute_url": 
"https://datasets.softwareheritage.org/datasets/2022-04-25-license/", "anchor_url": "/derived/license/#2022-04-25-license", "annex_url": "https://annex.softwareheritage.org/public/dataset/license-blobs/2022-04-25/", "category": "license", "citations": ["https://datasets.softwareheritage.org/Zacchiroli22.bib", "https://datasets.softwareheritage.org/GonzalezBarahonaLRZ23.bib"], "comments": null, "date": "2022-04-25", "deprecated": false, "derived_of": "2022-04-25-compressed", "description": "Dataset of the complete texts of free/open source software (FOSS) license variants.", "export_url": "/exports/2022-04-25/", "group": "derived", "hosted_on_annex": true, "hosted_on_s3": false, "id": "2022-04-25-license", "license": "CC-BY-4.0", "s3_download_command": null, "s3_url": null, "size": "15.2 GiB", "slug": "2022-04-25", "swh_download_command": null, "tables": null, "tags": ["license", "derived"], "teaser_of": null, "title": "License blobs", "type": "license", "url_safe_id": "2022-04-25-license", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2021-03-23-compressed/", "anchor_url": "/graphs/compressed/#2021-03-23-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": null, "date": "2021-03-23", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. 
The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2021-03-23/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2021-03-23-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2021-03-23/compressed/ 2021-03-23-compressed", "s3_url": "s3://softwareheritage/graph/2021-03-23/compressed/", "size": null, "slug": "2021-03-23", "swh_download_command": "swh datasets download-graph 2021-03-23", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "Full compressed graph", "type": "compressed", "url_safe_id": "2021-03-23-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2021-03-23-orc/", "anchor_url": "/graphs/columnar/#2021-03-23-orc", "annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": null, "date": "2021-03-23", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2021-03-23/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2021-03-23-orc", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2021-03-23/orc/ 2021-03-23-orc", "s3_url": "s3://softwareheritage/graph/2021-03-23/orc/", "size": "8.4 TiB", "slug": "2021-03-23", "swh_download_command": "swh datasets download-export 2021-03-23", "tables": null, "tags": ["orc"], "teaser_of": null, "title": "Graph export in columnar tables", "type": "orc", "url_safe_id": "2021-03-23-orc", "variant": 
null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2021-03-23-popular-3k-python-compressed/", "anchor_url": "/teasers/compressed/#2021-03-23-popular-3k-python-compressed", "annex_url": null, "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": "The popular-3k-python teaser contains a subset of 2197 popular repositories **tagged as being written in the Python language**, from GitHub, GitLab.com, PyPI and Debian. The selection criteria to pick the software origins were the following:\n\n- the 580 most popular GitHub projects written in Python (by number of stars),\n- the 135 GitLab.com projects written in Python that have 2 stars or more,\n- the 827 most popular PyPI projects (by usage statistics, according to the Top PyPI Packages database),\n- the 655 most popular Debian packages with the debtag implemented-in::python (by \"votes\" according to the Debian Popularity Contest database)\n", "date": "2021-03-23", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. 
The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2021-03-23/", "group": "teasers", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2021-03-23-popular-3k-python-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2021-03-23-popular-3k-python/compressed/ 2021-03-23-popular-3k-python-compressed", "s3_url": "s3://softwareheritage/graph/2021-03-23-popular-3k-python/compressed/", "size": "15 GB", "slug": "2021-03-23-popular-3k-python", "swh_download_command": "swh datasets download-graph 2021-03-23-popular-3k-python", "tables": null, "tags": ["compressed", "teaser"], "teaser_of": "2021-03-23-compressed", "title": "Popular 3k python compressed graph", "type": "compressed", "url_safe_id": "2021-03-23-popular-3k-python-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2021-03-23-popular-3k-python-orc/", "anchor_url": "/teasers/columnar/#2021-03-23-popular-3k-python-orc", "annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": "The popular-3k-python teaser contains a subset of 2197 popular repositories **tagged as being written in the Python language**, from GitHub, GitLab.com, PyPI and Debian. 
The selection criteria to pick the software origins were the following:\n\n- the 580 most popular GitHub projects written in Python (by number of stars),\n- the 135 GitLab.com projects written in Python that have 2 stars or more,\n- the 827 most popular PyPI projects (by usage statistics, according to the Top PyPI Packages database),\n- the 655 most popular Debian packages with the debtag implemented-in::python (by \"votes\" according to the Debian Popularity Contest database)\n", "date": "2021-03-23", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2021-03-23/", "group": "teasers", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2021-03-23-popular-3k-python-orc", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2021-03-23-popular-3k-python/orc/ 2021-03-23-popular-3k-python-orc", "s3_url": "s3://softwareheritage/graph/2021-03-23-popular-3k-python/orc/", "size": "36 GB", "slug": "2021-03-23-popular-3k-python", "swh_download_command": "swh datasets download-export 2021-03-23-popular-3k-python", "tables": null, "tags": ["orc", "teaser"], "teaser_of": "2021-03-23-orc", "title": "Popular 3k python columnar tables", "type": "orc", "url_safe_id": "2021-03-23-popular-3k-python-orc", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2021-03-23-license/", "anchor_url": "/derived/license/#2021-03-23-license", "annex_url": "https://annex.softwareheritage.org/public/dataset/license-blobs/2021-03-23/", "category": "license", "citations": ["https://datasets.softwareheritage.org/Zacchiroli22.bib", "https://datasets.softwareheritage.org/GonzalezBarahonaLRZ23.bib"], "comments": null, "date": "2021-03-23", "deprecated": false, "derived_of": 
"2021-03-23-compressed", "description": "Dataset of the complete texts of free/open source software (FOSS) license variants.", "export_url": "/exports/2021-03-23/", "group": "derived", "hosted_on_annex": true, "hosted_on_s3": false, "id": "2021-03-23-license", "license": "CC-BY-4.0", "s3_download_command": null, "s3_url": null, "size": "14.1 GiB", "slug": "2021-03-23", "swh_download_command": null, "tables": null, "tags": ["license", "derived"], "teaser_of": null, "title": "License blobs", "type": "license", "url_safe_id": "2021-03-23-license", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2020-12-15-compressed/", "anchor_url": "/graphs/compressed/#2020-12-15-compressed", "annex_url": "https://annex.softwareheritage.org/public/dataset/graph/2020-12-15/compressed/", "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": null, "date": "2020-12-15", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. 
The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2020-12-15/", "group": "graphs", "hosted_on_annex": true, "hosted_on_s3": true, "id": "2020-12-15-compressed", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2020-12-15/compressed/ 2020-12-15-compressed", "s3_url": "s3://softwareheritage/graph/2020-12-15/compressed/", "size": null, "slug": "2020-12-15", "swh_download_command": "swh datasets download-graph 2020-12-15", "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "Full compressed graph", "type": "compressed", "url_safe_id": "2020-12-15-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2020-12-15-csv/", "anchor_url": "/graphs/columnar/#2020-12-15-csv", "annex_url": null, "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": "- edges as graph.edges.{cnt,ori,rel,rev,snp}.csv.zst and graph.edges.dir.{00..21}.csv.zst\n- nodes as graph.nodes.csv.zst\n- deduplicated labels as graph.labels.csv.zst\n- statistics as graph.edges.count.txt, graph.edges.stats.txt, graph.labels.count.txt, graph.nodes.count.txt, and graph.nodes.stats.txt\n", "date": "2020-12-15", "deprecated": true, "derived_of": null, "description": "This export has a CSV representation of nodes and edges instead of columnar.", "export_url": "/exports/2020-12-15/", "group": "graphs", "hosted_on_annex": false, "hosted_on_s3": true, "id": "2020-12-15-csv", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2020-12-15/csv/ 2020-12-15-csv", "s3_url": "s3://softwareheritage/graph/2020-12-15/csv/", "size": "8.4 TiB", "slug": "2020-12-15", "swh_download_command": null, "tables": null, "tags": 
["csv"], "teaser_of": null, "title": "Graph export in CSV", "type": "csv", "url_safe_id": "2020-12-15-csv", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2020-12-15-gitlab-100k-compressed/", "anchor_url": "/teasers/compressed/#2020-12-15-gitlab-100k-compressed", "annex_url": "https://annex.softwareheritage.org/public/dataset/graph/2020-12-15-gitlab-100k/compressed/", "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": "A teaser dataset containing the 100k most popular GitLab.com repositories\n", "date": "2020-12-15", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2020-12-15/", "group": "teasers", "hosted_on_annex": true, "hosted_on_s3": false, "id": "2020-12-15-gitlab-100k-compressed", "license": "CC-BY-4.0", "s3_download_command": null, "s3_url": null, "size": null, "slug": "2020-12-15-gitlab-100k", "swh_download_command": null, "tables": null, "tags": ["compressed", "teaser"], "teaser_of": "2020-12-15-compressed", "title": "GitLab 100k compressed graph", "type": "compressed", "url_safe_id": "2020-12-15-gitlab-100k-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2020-12-15-gitlab-all-compressed/", "anchor_url": "/teasers/compressed/#2020-12-15-gitlab-all-compressed", "annex_url": "https://annex.softwareheritage.org/public/dataset/graph/2020-12-15-gitlab-all/compressed/", "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", 
"https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": "A teaser dataset containing the entirety of GitLab.com\n", "date": "2020-12-15", "deprecated": false, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2020-12-15/", "group": "teasers", "hosted_on_annex": true, "hosted_on_s3": false, "id": "2020-12-15-gitlab-all-compressed", "license": "CC-BY-4.0", "s3_download_command": null, "s3_url": null, "size": null, "slug": "2020-12-15-gitlab-all", "swh_download_command": null, "tables": null, "tags": ["compressed", "teaser"], "teaser_of": "2020-12-15-compressed", "title": "GitLab all compressed graph", "type": "compressed", "url_safe_id": "2020-12-15-gitlab-all-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2020-05-20-compressed/", "anchor_url": "/graphs/compressed/#2020-05-20-compressed", "annex_url": "https://annex.softwareheritage.org/public/dataset/graph/2020-05-20/compressed/", "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": "DEPRECATED: known issue with missing snapshot edges\n", "date": "2020-05-20", "deprecated": true, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. 
The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2020-05-20/", "group": "graphs", "hosted_on_annex": true, "hosted_on_s3": false, "id": "2020-05-20-compressed", "license": "CC-BY-4.0", "s3_download_command": null, "s3_url": null, "size": null, "slug": "2020-05-20", "swh_download_command": null, "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "Full compressed graph", "type": "compressed", "url_safe_id": "2020-05-20-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2018-09-25-compressed/", "anchor_url": "/graphs/compressed/#2018-09-25-compressed", "annex_url": "https://annex.softwareheritage.org/public/dataset/graph/2018-09-25/compressed/", "category": "compressed", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib", "https://datasets.softwareheritage.org/msr-2019-swh.bib"], "comments": "A full export of the graph dated from January 2019. The export was done in two phases, one of them called \"2018-09-25\" and the other \"2019-01-28\". They both refer to the same dataset, but the different formats have various inconsistencies between them.\n", "date": "2018-09-25", "deprecated": true, "derived_of": null, "description": "A compact and highly-efficient representation of the graph dataset, suited for scale-up analysis on high-end machines with large amounts of memory. 
The graph is compressed in Boldi-Vigna representation, designed to be loaded by the WebGraph framework, specifically using our swh-graph library.", "export_url": "/exports/2018-09-25/", "group": "graphs", "hosted_on_annex": true, "hosted_on_s3": false, "id": "2018-09-25-compressed", "license": "CC-BY-4.0", "s3_download_command": null, "s3_url": null, "size": null, "slug": "2018-09-25", "swh_download_command": null, "tables": null, "tags": ["compressed"], "teaser_of": null, "title": "Full compressed graph", "type": "compressed", "url_safe_id": "2018-09-25-compressed", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2018-09-25-parquet/", "anchor_url": "/graphs/columnar/#2018-09-25-parquet", "annex_url": "https://annex.softwareheritage.org/public/dataset/graph/2018-09-25/parquet/", "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": "A full export of the graph dated from January 2019. The export was done in two phases, one of them called \"2018-09-25\" and the other \"2019-01-28\". 
They both refer to the same dataset, but the different formats have various inconsistencies between them.\n", "date": "2018-09-25", "deprecated": true, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2018-09-25/", "group": "graphs", "hosted_on_annex": true, "hosted_on_s3": true, "id": "2018-09-25-parquet", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2018-09-25/parquet/ 2018-09-25-parquet", "s3_url": "s3://softwareheritage/graph/2018-09-25/parquet/", "size": "1.2 TiB", "slug": "2018-09-25", "swh_download_command": null, "tables": null, "tags": ["parquet"], "teaser_of": null, "title": "Graph export in columnar tables", "type": "parquet", "url_safe_id": "2018-09-25-parquet", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2019-01-28-popular-4k-parquet/", "anchor_url": "/teasers/columnar/#2019-01-28-popular-4k-parquet", "annex_url": "https://annex.softwareheritage.org/public/dataset/graph/2019-01-28-popular-4k/parquet/", "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": "This teaser dataset contains a subset of 4000 popular repositories from GitHub, GitLab.com, PyPI and Debian. 
The selection criteria to pick the software origins were the following:\n\n- The 1000 most popular GitHub projects (by number of stars)\n- The 1000 most popular GitLab.com projects (by number of stars)\n- The 1000 most popular PyPI projects (by usage statistics, according to the Top PyPI Packages database),\n- The 1000 most popular Debian packages (by \"votes\" according to the Debian Popularity Contest database)\n", "date": "2019-01-28", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2019-01-28/", "group": "teasers", "hosted_on_annex": true, "hosted_on_s3": true, "id": "2019-01-28-popular-4k-parquet", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2019-01-28-popular-4k/parquet/ 2019-01-28-popular-4k-parquet", "s3_url": "s3://softwareheritage/graph/2019-01-28-popular-4k/parquet/", "size": "27 GB", "slug": "2019-01-28-popular-4k", "swh_download_command": null, "tables": null, "tags": ["parquet", "teaser"], "teaser_of": "2018-09-25-parquet", "title": "Popular 4k columnar tables", "type": "parquet", "url_safe_id": "2019-01-28-popular-4k-parquet", "variant": null}, {"absolute_url": "https://datasets.softwareheritage.org/datasets/2019-01-28-popular-3k-python-parquet/", "anchor_url": "/teasers/columnar/#2019-01-28-popular-3k-python-parquet", "annex_url": "https://annex.softwareheritage.org/public/dataset/graph/2019-01-28-popular-3k-python/parquet/", "category": "columnar", "citations": ["https://datasets.softwareheritage.org/AbramaticCZ18.bib", "https://datasets.softwareheritage.org/CosmoZ17.bib"], "comments": "The popular-3k-python teaser contains a subset of 3052 popular repositories tagged as being written in the Python language, from GitHub, GitLab.com, PyPI 
and Debian. The selection criteria to pick the software origins were the following, similar to popular-4k:\n\n- the 1000 most popular GitHub projects written in Python (by number of stars),\n- the 131 GitLab.com projects written in Python that have 2 stars or more,\n- the 1000 most popular PyPI projects (by usage statistics, according to the Top PyPI Packages database),\n- the 1000 most popular Debian packages with the debtag implemented-in::python (by \"votes\" according to the Debian Popularity Contest database).\n", "date": "2019-01-28", "deprecated": false, "derived_of": null, "description": "A set of relational tables stored in a columnar format such as Apache ORC, which is particularly suited for scale-out analyses on data lakes and big data processing ecosystems such as the Hadoop environment.", "export_url": "/exports/2019-01-28/", "group": "teasers", "hosted_on_annex": true, "hosted_on_s3": true, "id": "2019-01-28-popular-3k-python-parquet", "license": "CC-BY-4.0", "s3_download_command": "aws s3 cp --recursive --no-sign-request s3://softwareheritage/graph/2019-01-28-popular-3k-python/parquet/ 2019-01-28-popular-3k-python-parquet", "s3_url": "s3://softwareheritage/graph/2019-01-28-popular-3k-python/parquet/", "size": "5.3 GB", "slug": "2019-01-28-popular-3k-python", "swh_download_command": null, "tables": null, "tags": ["parquet", "teaser"], "teaser_of": "2018-09-25-parquet", "title": "Popular 3k python columnar tables", "type": "parquet", "url_safe_id": "2019-01-28-popular-3k-python-parquet", "variant": null}]