Get the number of rows and the size in bytes
This guide shows you how to use Datasets Server’s /size
endpoint to retrieve a dataset’s size programmatically. Feel free to also try it out with ReDoc.
The /size
endpoint accepts the dataset name as its query parameter:
Python
JavaScript
cURL
import requests

# NOTE: API_TOKEN is not defined in this snippet; define it (your API token)
# before running, otherwise the next line raises NameError.
headers = {"Authorization": f"Bearer {API_TOKEN}"}
# The dataset name is passed as the `dataset` query parameter.
API_URL = "https://datasets-server.huggingface.co/size?dataset=duorc"


def query():
    """Send a GET request to the /size endpoint and return the decoded JSON body."""
    response = requests.get(API_URL, headers=headers)
    return response.json()


data = query()
The endpoint response is a JSON object containing the size of the dataset, as well as the size of each of its configurations and splits. It provides the number of rows, the number of columns (where applicable) and the size in bytes for the different forms of the data: original files, size in memory (RAM) and auto-converted parquet files. For example, the duorc dataset has 187,213 rows across all its configurations and splits, for a total of 97 MB.
{
"size": {
"dataset": {
"dataset": "duorc",
"num_bytes_original_files": 97383710,
"num_bytes_parquet_files": 58710973,
"num_bytes_memory": 1059067116,
"num_rows": 187213
},
"configs": [
{
"dataset": "duorc",
"config": "ParaphraseRC",
"num_bytes_original_files": 62921050,
"num_bytes_parquet_files": 37709127,
"num_bytes_memory": 718409158,
"num_rows": 100972,
"num_columns": 7
},
{
"dataset": "duorc",
"config": "SelfRC",
"num_bytes_original_files": 34462660,
"num_bytes_parquet_files": 21001846,
"num_bytes_memory": 340657958,
"num_rows": 86241,
"num_columns": 7
}
],
"splits": [
{
"dataset": "duorc",
"config": "ParaphraseRC",
"split": "train",
"num_bytes_parquet_files": 26005668,
"num_bytes_memory": 496682909,
"num_rows": 69524,
"num_columns": 7
},
{
"dataset": "duorc",
"config": "ParaphraseRC",
"split": "validation",
"num_bytes_parquet_files": 5566868,
"num_bytes_memory": 106510489,
"num_rows": 15591,
"num_columns": 7
},
{
"dataset": "duorc",
"config": "ParaphraseRC",
"split": "test",
"num_bytes_parquet_files": 6136591,
"num_bytes_memory": 115215760,
"num_rows": 15857,
"num_columns": 7
},
{
"dataset": "duorc",
"config": "SelfRC",
"split": "train",
"num_bytes_parquet_files": 14851720,
"num_bytes_memory": 239852729,
"num_rows": 60721,
"num_columns": 7
},
{
"dataset": "duorc",
"config": "SelfRC",
"split": "validation",
"num_bytes_parquet_files": 3114390,
"num_bytes_memory": 51662519,
"num_rows": 12961,
"num_columns": 7
},
{
"dataset": "duorc",
"config": "SelfRC",
"split": "test",
"num_bytes_parquet_files": 3035736,
"num_bytes_memory": 49142710,
"num_rows": 12559,
"num_columns": 7
}
]
},
"pending": [],
"failed": [],
"partial": false
}