gem_id (string) | gem_parent_id (string) | concept_set_id (int32, 0–32.7k) | concepts (list) | target (string) | references (list) |
---|---|---|---|---|---|
"common_gen-train-0" | "common_gen-train-0" | 0 | [
"mountain",
"ski",
"skier"
] | "Skier skis down the mountain" | [] |
"common_gen-train-1" | "common_gen-train-1" | 0 | [
"mountain",
"ski",
"skier"
] | "A skier is skiing down a mountain." | [] |
"common_gen-train-2" | "common_gen-train-2" | 0 | [
"mountain",
"ski",
"skier"
] | "Three skiers are skiing on a snowy mountain." | [] |
"common_gen-train-3" | "common_gen-train-3" | 1 | [
"dog",
"tail",
"wag"
] | "The dog is wagging his tail." | [] |
"common_gen-train-4" | "common_gen-train-4" | 1 | [
"dog",
"tail",
"wag"
] | "A dog wags his tail at the boy." | [] |
"common_gen-train-5" | "common_gen-train-5" | 1 | [
"dog",
"tail",
"wag"
] | "a dog wags its tail with its heart" | [] |
"common_gen-train-6" | "common_gen-train-6" | 2 | [
"canoe",
"lake",
"paddle"
] | "woman paddling canoe on a lake" | [] |
"common_gen-train-7" | "common_gen-train-7" | 2 | [
"canoe",
"lake",
"paddle"
] | "paddle an open canoe along lake ." | [] |
"common_gen-train-8" | "common_gen-train-8" | 2 | [
"canoe",
"lake",
"paddle"
] | "a man paddles his canoe on the lake." | [] |
"common_gen-train-9" | "common_gen-train-9" | 3 | [
"pull",
"station",
"train"
] | "a train pulls into station" | [] |
"common_gen-train-10" | "common_gen-train-10" | 3 | [
"pull",
"station",
"train"
] | "train pulling in to station ." | [] |
"common_gen-train-11" | "common_gen-train-11" | 3 | [
"pull",
"station",
"train"
] | "the train pulling into station" | [] |
"common_gen-train-12" | "common_gen-train-12" | 4 | [
"eat",
"hay",
"horse"
] | "A horse is eating hay." | [] |
"common_gen-train-13" | "common_gen-train-13" | 4 | [
"eat",
"hay",
"horse"
] | "The horses are eating hay." | [] |
"common_gen-train-14" | "common_gen-train-14" | 4 | [
"eat",
"hay",
"horse"
] | "A horse eats hay in the barn" | [] |
"common_gen-train-15" | "common_gen-train-15" | 5 | [
"fan",
"match",
"watch"
] | "watch a match with fans" | [] |
"common_gen-train-16" | "common_gen-train-16" | 5 | [
"fan",
"match",
"watch"
] | "the fans watch the match" | [] |
"common_gen-train-17" | "common_gen-train-17" | 5 | [
"fan",
"match",
"watch"
] | "a fan watches during the match" | [] |
"common_gen-train-18" | "common_gen-train-18" | 6 | [
"lake",
"mountain",
"surround"
] | "a lake surrounded by mountains ." | [] |
"common_gen-train-19" | "common_gen-train-19" | 6 | [
"lake",
"mountain",
"surround"
] | "lake from the surrounding mountains" | [] |
"common_gen-train-20" | "common_gen-train-20" | 6 | [
"lake",
"mountain",
"surround"
] | "one of the mountain ranges that surrounds lake ." | [] |
"common_gen-train-21" | "common_gen-train-21" | 7 | [
"dog",
"lay",
"rug"
] | "A dog laying on a rug." | [] |
"common_gen-train-22" | "common_gen-train-22" | 7 | [
"dog",
"lay",
"rug"
] | "The dogs laid down on the rug" | [] |
"common_gen-train-23" | "common_gen-train-23" | 7 | [
"dog",
"lay",
"rug"
] | "Brown dog chews on bone while laying on the rug." | [] |
"common_gen-train-24" | "common_gen-train-24" | 8 | [
"hang",
"painting",
"wall"
] | "hanging a painting on a wall at home" | [] |
"common_gen-train-25" | "common_gen-train-25" | 8 | [
"hang",
"painting",
"wall"
] | "paintings of horses hang on the walls ." | [] |
"common_gen-train-26" | "common_gen-train-26" | 8 | [
"hang",
"painting",
"wall"
] | "There is only one painting hanging on the wall." | [] |
"common_gen-train-27" | "common_gen-train-27" | 9 | [
"carry",
"food",
"tray"
] | "boy carries a tray of food ." | [] |
"common_gen-train-28" | "common_gen-train-28" | 9 | [
"carry",
"food",
"tray"
] | "people carrying food on trays" | [] |
"common_gen-train-29" | "common_gen-train-29" | 9 | [
"carry",
"food",
"tray"
] | "The woman is carrying two trays of food." | [] |
"common_gen-train-30" | "common_gen-train-30" | 10 | [
"match",
"stadium",
"watch"
] | "soccer fans watches a league match in a stadium" | [] |
"common_gen-train-31" | "common_gen-train-31" | 10 | [
"match",
"stadium",
"watch"
] | "A stadium full of people watching a tennis match." | [] |
"common_gen-train-32" | "common_gen-train-32" | 10 | [
"match",
"stadium",
"watch"
] | "supporters watch the match from a hill outside the stadium" | [] |
"common_gen-train-33" | "common_gen-train-33" | 11 | [
"cat",
"lick",
"paw"
] | "A cat licks his paws." | [] |
"common_gen-train-34" | "common_gen-train-34" | 11 | [
"cat",
"lick",
"paw"
] | "A cat is licking its paw" | [] |
"common_gen-train-35" | "common_gen-train-35" | 11 | [
"cat",
"lick",
"paw"
] | "the cat licks the pad of his front paw" | [] |
"common_gen-train-36" | "common_gen-train-36" | 12 | [
"room",
"tile",
"wall"
] | "a bath room with a toilet and tiled walls" | [] |
"common_gen-train-37" | "common_gen-train-37" | 12 | [
"room",
"tile",
"wall"
] | "Three men tile a wall in a large empty room" | [] |
"common_gen-train-38" | "common_gen-train-38" | 12 | [
"room",
"tile",
"wall"
] | "A wall mounted urinal in a checker tiled rest room." | [] |
"common_gen-train-39" | "common_gen-train-39" | 13 | [
"canoe",
"lake",
"shore"
] | "canoe on a shore of lake ." | [] |
"common_gen-train-40" | "common_gen-train-40" | 13 | [
"canoe",
"lake",
"shore"
] | "canoe on shore with rainbow across the lake" | [] |
"common_gen-train-41" | "common_gen-train-41" | 13 | [
"canoe",
"lake",
"shore"
] | "Several canoes parked in the grass on the shore of a lake " | [] |
"common_gen-train-42" | "common_gen-train-42" | 14 | [
"mountain",
"skier",
"way"
] | "A skier on his way to the mountain." | [] |
"common_gen-train-43" | "common_gen-train-43" | 14 | [
"mountain",
"skier",
"way"
] | "skiers make their way down the mountain" | [] |
"common_gen-train-44" | "common_gen-train-44" | 14 | [
"mountain",
"skier",
"way"
] | "A skier making her way down a snowy mountain." | [] |
"common_gen-train-45" | "common_gen-train-45" | 15 | [
"boat",
"drive",
"lake"
] | "driving boat on a lake" | [] |
"common_gen-train-46" | "common_gen-train-46" | 15 | [
"boat",
"drive",
"lake"
] | "a boat is being driven through a lake" | [] |
"common_gen-train-47" | "common_gen-train-47" | 15 | [
"boat",
"drive",
"lake"
] | "A fisherman drives his boat on the lake" | [] |
"common_gen-train-48" | "common_gen-train-48" | 16 | [
"eat",
"grass",
"horse"
] | "A horse is eating grass." | [] |
"common_gen-train-49" | "common_gen-train-49" | 16 | [
"eat",
"grass",
"horse"
] | "The horses are eating grass." | [] |
"common_gen-train-50" | "common_gen-train-50" | 16 | [
"eat",
"grass",
"horse"
] | "The old horse ate grass all day." | [] |
"common_gen-train-51" | "common_gen-train-51" | 17 | [
"come",
"track",
"train"
] | "train coming down the track" | [] |
"common_gen-train-52" | "common_gen-train-52" | 17 | [
"come",
"track",
"train"
] | "A train is coming along on a track." | [] |
"common_gen-train-53" | "common_gen-train-53" | 17 | [
"come",
"track",
"train"
] | "a long train in coming down some tracks" | [] |
"common_gen-train-54" | "common_gen-train-54" | 18 | [
"move",
"track",
"train"
] | "train moving on the tracks" | [] |
"common_gen-train-55" | "common_gen-train-55" | 18 | [
"move",
"track",
"train"
] | "A red train is moving down a track" | [] |
"common_gen-train-56" | "common_gen-train-56" | 18 | [
"move",
"track",
"train"
] | "A train moves slowly on some empty tracks" | [] |
"common_gen-train-57" | "common_gen-train-57" | 19 | [
"leave",
"station",
"train"
] | "a train leaves the station" | [] |
"common_gen-train-58" | "common_gen-train-58" | 19 | [
"leave",
"station",
"train"
] | "a train leaving station bound" | [] |
"common_gen-train-59" | "common_gen-train-59" | 19 | [
"leave",
"station",
"train"
] | "a fast train about to leave station" | [] |
"common_gen-train-60" | "common_gen-train-60" | 20 | [
"passenger",
"station",
"train"
] | "train and passengers at the station" | [] |
"common_gen-train-61" | "common_gen-train-61" | 20 | [
"passenger",
"station",
"train"
] | "passengers leaving a train on a station" | [] |
"common_gen-train-62" | "common_gen-train-62" | 20 | [
"passenger",
"station",
"train"
] | "a train at station with no passengers joining" | [] |
"common_gen-train-63" | "common_gen-train-63" | 21 | [
"arrive",
"station",
"train"
] | "a train arrives at station" | [] |
"common_gen-train-64" | "common_gen-train-64" | 21 | [
"arrive",
"station",
"train"
] | "train arriving at the station" | [] |
"common_gen-train-65" | "common_gen-train-65" | 21 | [
"arrive",
"station",
"train"
] | "subway train arrives in the station" | [] |
"common_gen-train-66" | "common_gen-train-66" | 22 | [
"sit",
"station",
"train"
] | "a train sits at the station" | [] |
"common_gen-train-67" | "common_gen-train-67" | 22 | [
"sit",
"station",
"train"
] | "A train that is sitting in a station." | [] |
"common_gen-train-68" | "common_gen-train-68" | 22 | [
"sit",
"station",
"train"
] | "A red train sitting at an empty station." | [] |
"common_gen-train-69" | "common_gen-train-69" | 23 | [
"horse",
"pull",
"wagon"
] | "a tea of horses pull a wagon" | [] |
"common_gen-train-70" | "common_gen-train-70" | 23 | [
"horse",
"pull",
"wagon"
] | "horse pulling man on wagon ." | [] |
"common_gen-train-71" | "common_gen-train-71" | 23 | [
"horse",
"pull",
"wagon"
] | "A wagon is being pulled by horses." | [] |
"common_gen-train-72" | "common_gen-train-72" | 24 | [
"station",
"stop",
"train"
] | "train is stopped at a station" | [] |
"common_gen-train-73" | "common_gen-train-73" | 24 | [
"station",
"stop",
"train"
] | "trains stopping at the station" | [] |
"common_gen-train-74" | "common_gen-train-74" | 24 | [
"station",
"stop",
"train"
] | "The empty train is stopped in the station." | [] |
"common_gen-train-75" | "common_gen-train-75" | 25 | [
"plane",
"runway",
"sit"
] | "A plane sits on the runway" | [] |
"common_gen-train-76" | "common_gen-train-76" | 25 | [
"plane",
"runway",
"sit"
] | "An old plane is sitting on a runway." | [] |
"common_gen-train-77" | "common_gen-train-77" | 25 | [
"plane",
"runway",
"sit"
] | "Two planes are sitting out on the runway." | [] |
"common_gen-train-78" | "common_gen-train-78" | 26 | [
"cloud",
"fly",
"plane"
] | "plane flying into the clouds" | [] |
"common_gen-train-79" | "common_gen-train-79" | 26 | [
"cloud",
"fly",
"plane"
] | "flying plane against a cloud ." | [] |
"common_gen-train-80" | "common_gen-train-80" | 26 | [
"cloud",
"fly",
"plane"
] | "A plane flies over head in the clouds." | [] |
"common_gen-train-81" | "common_gen-train-81" | 27 | [
"dog",
"herd",
"sheep"
] | "A dog herds a sheep." | [] |
"common_gen-train-82" | "common_gen-train-82" | 27 | [
"dog",
"herd",
"sheep"
] | "A dog is herding sheep." | [] |
"common_gen-train-83" | "common_gen-train-83" | 27 | [
"dog",
"herd",
"sheep"
] | "The dogs are herding sheep." | [] |
"common_gen-train-84" | "common_gen-train-84" | 28 | [
"beach",
"boat",
"sit"
] | "boats sitting on the beach" | [] |
"common_gen-train-85" | "common_gen-train-85" | 28 | [
"beach",
"boat",
"sit"
] | "a boat is sitting up on a beach" | [] |
"common_gen-train-86" | "common_gen-train-86" | 28 | [
"beach",
"boat",
"sit"
] | "Pelicans sit on a blue boat at the beach." | [] |
"common_gen-train-87" | "common_gen-train-87" | 29 | [
"come",
"station",
"train"
] | "a train coming into station" | [] |
"common_gen-train-88" | "common_gen-train-88" | 29 | [
"come",
"station",
"train"
] | "tube train comes to station ." | [] |
"common_gen-train-89" | "common_gen-train-89" | 29 | [
"come",
"station",
"train"
] | "train coming in to the station" | [] |
"common_gen-train-90" | "common_gen-train-90" | 30 | [
"cloud",
"float",
"sky"
] | "clouds floating in the sky" | [] |
"common_gen-train-91" | "common_gen-train-91" | 30 | [
"cloud",
"float",
"sky"
] | "clouds float through a blue sky" | [] |
"common_gen-train-92" | "common_gen-train-92" | 30 | [
"cloud",
"float",
"sky"
] | "shot of clouds that float across the sky" | [] |
"common_gen-train-93" | "common_gen-train-93" | 31 | [
"eat",
"elephant",
"grass"
] | "elephants pulling grass to eat ." | [] |
"common_gen-train-94" | "common_gen-train-94" | 31 | [
"eat",
"elephant",
"grass"
] | "An elephant is eating grass in Kenya." | [] |
"common_gen-train-95" | "common_gen-train-95" | 31 | [
"eat",
"elephant",
"grass"
] | "a bunch of elephants are eating grass" | [] |
"common_gen-train-96" | "common_gen-train-96" | 32 | [
"family",
"spend",
"time"
] | "family spend time in the park" | [] |
"common_gen-train-97" | "common_gen-train-97" | 32 | [
"family",
"spend",
"time"
] | "spending time with the family" | [] |
"common_gen-train-98" | "common_gen-train-98" | 32 | [
"family",
"spend",
"time"
] | "family spend time at a holidays" | [] |
"common_gen-train-99" | "common_gen-train-99" | 33 | [
"bathroom",
"tile",
"wall"
] | "black walls and tiles in the bathroom" | [] |
Dataset Card for GEM
Dataset Summary
GEM is a benchmark environment for Natural Language Generation with a focus on its Evaluation, both through human annotations and automated Metrics.
GEM aims to:
- measure NLG progress across 13 datasets spanning many NLG tasks and languages.
- provide an in-depth analysis of data and models presented via data statements and challenge sets.
- develop standards for evaluation of generated text using both automated and human metrics.
It is our goal to regularly update GEM and to encourage more inclusive practices in dataset development by extending existing data or developing datasets for additional languages.
You can find more complete information in the dataset cards for each of the subsets:
- CommonGen
- Czech Restaurant
- DART
- E2E
- MLSum
- Schema-Guided Dialog
- ToTTo
- WebNLG
- Wiki-Auto/ASSET/TURK
- WikiLingua
- XSum
The subsets are organized by task:
{
"summarization": {
"mlsum": ["mlsum_de", "mlsum_es"],
"wiki_lingua": ["wiki_lingua_es_en", "wiki_lingua_ru_en", "wiki_lingua_tr_en", "wiki_lingua_vi_en"],
"xsum": ["xsum"],
},
"struct2text": {
"common_gen": ["common_gen"],
"cs_restaurants": ["cs_restaurants"],
"dart": ["dart"],
"e2e": ["e2e_nlg"],
"totto": ["totto"],
"web_nlg": ["web_nlg_en", "web_nlg_ru"],
},
"simplification": {
"wiki_auto_asset_turk": ["wiki_auto_asset_turk"],
},
"dialog": {
"schema_guided_dialog": ["schema_guided_dialog"],
},
}
Each example has a single `target` in the training set, and a set of `references` (with one or more items) in the validation and test sets.
Supported Tasks and Leaderboards
Languages
Dataset Structure
Data Instances
common_gen
- Size of downloaded dataset files: 1.85 MB
- Size of the generated dataset: 9.23 MB
- Total amount of disk used: 11.07 MB
An example of validation
looks as follows.
{'concept_set_id': 0,
'concepts': ['field', 'look', 'stand'],
'gem_id': 'common_gen-validation-0',
'references': ['The player stood in the field looking at the batter.',
'The coach stands along the field, looking at the goalkeeper.',
'I stood and looked across the field, peacefully.',
'Someone stands, looking around the empty field.'],
'target': 'The player stood in the field looking at the batter.'}
cs_restaurants
- Size of downloaded dataset files: 1.47 MB
- Size of the generated dataset: 1.31 MB
- Total amount of disk used: 2.77 MB
An example of validation
looks as follows.
{'dialog_act': '?request(area)',
'dialog_act_delexicalized': '?request(area)',
'gem_id': 'cs_restaurants-validation-0',
'references': ['Jakou lokalitu hledáte ?'],
'target': 'Jakou lokalitu hledáte ?',
'target_delexicalized': 'Jakou lokalitu hledáte ?'}
dart
- Size of downloaded dataset files: 29.37 MB
- Size of the generated dataset: 27.44 MB
- Total amount of disk used: 56.81 MB
An example of validation
looks as follows.
{'dart_id': 0,
'gem_id': 'dart-validation-0',
'references': ['A school from Mars Hill, North Carolina, joined in 1973.'],
'subtree_was_extended': True,
'target': 'A school from Mars Hill, North Carolina, joined in 1973.',
'target_sources': ['WikiSQL_decl_sents'],
'tripleset': [['Mars Hill College', 'JOINED', '1973'], ['Mars Hill College', 'LOCATION', 'Mars Hill, North Carolina']]}
e2e_nlg
- Size of downloaded dataset files: 14.60 MB
- Size of the generated dataset: 12.14 MB
- Total amount of disk used: 26.74 MB
An example of validation
looks as follows.
{'gem_id': 'e2e_nlg-validation-0',
'meaning_representation': 'name[Alimentum], area[city centre], familyFriendly[no]',
'references': ['There is a place in the city centre, Alimentum, that is not family-friendly.'],
'target': 'There is a place in the city centre, Alimentum, that is not family-friendly.'}
mlsum_de
- Size of downloaded dataset files: 347.36 MB
- Size of the generated dataset: 951.06 MB
- Total amount of disk used: 1.30 GB
An example of validation
looks as follows.
{'date': '00/04/2019',
'gem_id': 'mlsum_de-validation-0',
'references': ['In einer Kleinstadt auf der Insel Usedom war eine junge Frau tot in ihrer Wohnung gefunden worden. Nun stehen zwei Bekannte unter Verdacht.'],
'target': 'In einer Kleinstadt auf der Insel Usedom war eine junge Frau tot in ihrer Wohnung gefunden worden. Nun stehen zwei Bekannte unter Verdacht.',
'text': 'Kerzen und Blumen stehen vor dem Eingang eines Hauses, in dem eine 18-jährige Frau tot aufgefunden wurde. In einer Kleinstadt auf der Insel Usedom war eine junge Frau tot in ...',
'title': 'Tod von 18-Jähriger auf Usedom: Zwei Festnahmen',
'topic': 'panorama',
'url': 'https://www.sueddeutsche.de/panorama/usedom-frau-tot-festnahme-verdaechtige-1.4412256'}
mlsum_es
- Size of downloaded dataset files: 514.11 MB
- Size of the generated dataset: 1.31 GB
- Total amount of disk used: 1.83 GB
An example of validation
looks as follows.
{'date': '05/01/2019',
'gem_id': 'mlsum_es-validation-0',
'references': ['El diseñador que dio carta de naturaleza al estilo genuinamente americano celebra el medio siglo de su marca entre grandes fastos y problemas financieros. Conectar con las nuevas generaciones es el regalo que precisa más que nunca'],
'target': 'El diseñador que dio carta de naturaleza al estilo genuinamente americano celebra el medio siglo de su marca entre grandes fastos y problemas financieros. Conectar con las nuevas generaciones es el regalo que precisa más que nunca',
'text': 'Un oso de peluche marcándose un heelflip de monopatín es todo lo que Ralph Lauren necesitaba esta Navidad. Estampado en un jersey de lana azul marino, supone la guinda que corona ...',
'title': 'Ralph Lauren busca el secreto de la eterna juventud',
'topic': 'elpais estilo',
'url': 'http://elpais.com/elpais/2019/01/04/estilo/1546617396_933318.html'}
schema_guided_dialog
- Size of downloaded dataset files: 8.64 MB
- Size of the generated dataset: 45.78 MB
- Total amount of disk used: 54.43 MB
An example of validation
looks as follows.
{'dialog_acts': [{'act': 2, 'slot': 'song_name', 'values': ['Carnivore']}, {'act': 2, 'slot': 'playback_device', 'values': ['TV']}],
'dialog_id': '10_00054',
'gem_id': 'schema_guided_dialog-validation-0',
'prompt': 'Yes, I would.',
'references': ['Please confirm the song Carnivore on tv.'],
'target': 'Please confirm the song Carnivore on tv.',
'turn_id': 15}
totto
- Size of downloaded dataset files: 187.73 MB
- Size of the generated dataset: 757.99 MB
- Total amount of disk used: 945.72 MB
An example of validation
looks as follows.
{'example_id': '7391450717765563190',
'gem_id': 'totto-validation-0',
'highlighted_cells': [[3, 0], [3, 2], [3, 3]],
'overlap_subset': 'True',
'references': ['Daniel Henry Chamberlain was the 76th Governor of South Carolina from 1874.',
'Daniel Henry Chamberlain was the 76th Governor of South Carolina, beginning in 1874.',
'Daniel Henry Chamberlain was the 76th Governor of South Carolina who took office in 1874.'],
'sentence_annotations': [{'final_sentence': 'Daniel Henry Chamberlain was the 76th Governor of South Carolina from 1874.',
'original_sentence': 'Daniel Henry Chamberlain (June 23, 1835 – April 13, 1907) was an American planter, lawyer, author and the 76th Governor of South Carolina '
'from 1874 until 1877.',
'sentence_after_ambiguity': 'Daniel Henry Chamberlain was the 76th Governor of South Carolina from 1874.',
'sentence_after_deletion': 'Daniel Henry Chamberlain was the 76th Governor of South Carolina from 1874.'},
...
],
'table': [[{'column_span': 1, 'is_header': True, 'row_span': 1, 'value': '#'},
{'column_span': 2, 'is_header': True, 'row_span': 1, 'value': 'Governor'},
{'column_span': 1, 'is_header': True, 'row_span': 1, 'value': 'Took Office'},
{'column_span': 1, 'is_header': True, 'row_span': 1, 'value': 'Left Office'}],
[{'column_span': 1, 'is_header': True, 'row_span': 1, 'value': '74'},
{'column_span': 1, 'is_header': False, 'row_span': 1, 'value': '-'},
{'column_span': 1, 'is_header': False, 'row_span': 1, 'value': 'Robert Kingston Scott'},
{'column_span': 1, 'is_header': False, 'row_span': 1, 'value': 'July 6, 1868'}],
...
],
'table_page_title': 'List of Governors of South Carolina',
'table_section_text': 'Parties Democratic Republican',
'table_section_title': 'Governors under the Constitution of 1868',
'table_webpage_url': 'http://en.wikipedia.org/wiki/List_of_Governors_of_South_Carolina',
'target': 'Daniel Henry Chamberlain was the 76th Governor of South Carolina from 1874.',
'totto_id': 0}
web_nlg_en
- Size of downloaded dataset files: 12.95 MB
- Size of the generated dataset: 14.63 MB
- Total amount of disk used: 27.57 MB
An example of validation
looks as follows.
{'category': 'Airport',
'gem_id': 'web_nlg_en-validation-0',
'input': ['Aarhus | leader | Jacob_Bundsgaard'],
'references': ['The leader of Aarhus is Jacob Bundsgaard.'],
'target': 'The leader of Aarhus is Jacob Bundsgaard.',
'webnlg_id': 'dev/Airport/1/Id1'}
web_nlg_ru
- Size of downloaded dataset files: 7.63 MB
- Size of the generated dataset: 8.41 MB
- Total amount of disk used: 16.04 MB
An example of validation
looks as follows.
{'category': 'Airport',
'gem_id': 'web_nlg_ru-validation-0',
'input': ['Punjab,_Pakistan | leaderTitle | Provincial_Assembly_of_the_Punjab'],
'references': ['Пенджаб, Пакистан, возглавляется Провинциальной ассамблеей Пенджаба.', 'Пенджаб, Пакистан возглавляется Провинциальной ассамблеей Пенджаба.'],
'target': 'Пенджаб, Пакистан, возглавляется Провинциальной ассамблеей Пенджаба.',
'webnlg_id': 'dev/Airport/1/Id1'}
wiki_auto_asset_turk
- Size of downloaded dataset files: 127.27 MB
- Size of the generated dataset: 152.77 MB
- Total amount of disk used: 280.04 MB
An example of validation
looks as follows.
{'gem_id': 'wiki_auto_asset_turk-validation-0',
'references': ['The Gandalf Awards honor excellent writing in in fantasy literature.'],
'source': 'The Gandalf Awards, honoring achievement in fantasy literature, were conferred by the World Science Fiction Society annually from 1974 to 1981.',
'source_id': '350_691837-1-0-0',
'target': 'The Gandalf Awards honor excellent writing in in fantasy literature.',
'target_id': '350_691837-0-0-0'}
wiki_lingua_es_en
- Size of downloaded dataset files: 169.41 MB
- Size of the generated dataset: 287.60 MB
- Total amount of disk used: 457.01 MB
An example of validation
looks as follows.
{'gem_id': 'wiki_lingua_es_en-val-0',
'references': ["Practice matted hair prevention from early in your cat's life. Make sure that your cat is grooming itself effectively. Keep a close eye on cats with long hair."],
'source': 'Muchas personas presentan problemas porque no cepillaron el pelaje de sus gatos en una etapa temprana de su vida, ya que no lo consideraban necesario. Sin embargo, a medida que...',
'target': "Practice matted hair prevention from early in your cat's life. Make sure that your cat is grooming itself effectively. Keep a close eye on cats with long hair."}
wiki_lingua_ru_en
- Size of downloaded dataset files: 169.41 MB
- Size of the generated dataset: 211.21 MB
- Total amount of disk used: 380.62 MB
An example of validation
looks as follows.
{'gem_id': 'wiki_lingua_ru_en-val-0',
'references': ['Get immediate medical care if you notice signs of a complication. Undergo diagnostic tests to check for gallstones and complications. Ask your doctor about your treatment '
'options.'],
'source': 'И хотя, скорее всего, вам не о чем волноваться, следует незамедлительно обратиться к врачу, если вы подозреваете, что у вас возникло осложнение желчекаменной болезни. Это ...',
'target': 'Get immediate medical care if you notice signs of a complication. Undergo diagnostic tests to check for gallstones and complications. Ask your doctor about your treatment '
'options.'}
wiki_lingua_tr_en
- Size of downloaded dataset files: 169.41 MB
- Size of the generated dataset: 10.35 MB
- Total amount of disk used: 179.75 MB
An example of validation
looks as follows.
{'gem_id': 'wiki_lingua_tr_en-val-0',
'references': ['Open Instagram. Go to the video you want to download. Tap ⋮. Tap Copy Link. Open Google Chrome. Tap the address bar. Go to the SaveFromWeb site. Tap the "Paste Instagram Video" text box. Tap and hold the text box. Tap PASTE. Tap Download. Download the video. Find the video on your Android.'],
'source': 'Instagram uygulamasının çok renkli kamera şeklindeki simgesine dokun. Daha önce giriş yaptıysan Instagram haber kaynağı açılır. Giriş yapmadıysan istendiğinde e-posta adresini ...',
'target': 'Open Instagram. Go to the video you want to download. Tap ⋮. Tap Copy Link. Open Google Chrome. Tap the address bar. Go to the SaveFromWeb site. Tap the "Paste Instagram Video" text box. Tap and hold the text box. Tap PASTE. Tap Download. Download the video. Find the video on your Android.'}
wiki_lingua_vi_en
- Size of downloaded dataset files: 169.41 MB
- Size of the generated dataset: 41.02 MB
- Total amount of disk used: 210.43 MB
An example of validation
looks as follows.
{'gem_id': 'wiki_lingua_vi_en-val-0',
'references': ['Select the right time of year for planting the tree. You will usually want to plant your tree when it is dormant, or not flowering, during cooler or colder times of year.'],
'source': 'Bạn muốn cung cấp cho cây cơ hội tốt nhất để phát triển và sinh tồn. Trồng cây đúng thời điểm trong năm chính là yếu tố then chốt. Thời điểm sẽ thay đổi phụ thuộc vào loài cây ...',
'target': 'Select the right time of year for planting the tree. You will usually want to plant your tree when it is dormant, or not flowering, during cooler or colder times of year.'}
xsum
- Size of downloaded dataset files: 254.89 MB
- Size of the generated dataset: 70.67 MB
- Total amount of disk used: 325.56 MB
An example of validation
looks as follows.
{'document': 'Burberry reported pre-tax profits of £166m for the year to March. A year ago it made a loss of £16.1m, hit by charges at its Spanish operations.\n'
'In the past year it has opened 21 new stores and closed nine. It plans to open 20-30 stores this year worldwide.\n'
'The group has also focused on promoting the Burberry brand online...',
'gem_id': 'xsum-validation-0',
'references': ['Luxury fashion designer Burberry has returned to profit after opening new stores and spending more on online marketing'],
'target': 'Luxury fashion designer Burberry has returned to profit after opening new stores and spending more on online marketing',
'xsum_id': '10162122'}
Data Fields
The data fields are the same among all splits.
common_gen
- `gem_id`: a `string` feature.
- `concept_set_id`: an `int32` feature.
- `concepts`: a `list` of `string` features.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
cs_restaurants
- `gem_id`: a `string` feature.
- `dialog_act`: a `string` feature.
- `dialog_act_delexicalized`: a `string` feature.
- `target_delexicalized`: a `string` feature.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
dart
- `gem_id`: a `string` feature.
- `dart_id`: an `int32` feature.
- `tripleset`: a `list` of `string` features.
- `subtree_was_extended`: a `bool` feature.
- `target_sources`: a `list` of `string` features.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
e2e_nlg
- `gem_id`: a `string` feature.
- `meaning_representation`: a `string` feature.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
mlsum_de
- `gem_id`: a `string` feature.
- `text`: a `string` feature.
- `topic`: a `string` feature.
- `url`: a `string` feature.
- `title`: a `string` feature.
- `date`: a `string` feature.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
mlsum_es
- `gem_id`: a `string` feature.
- `text`: a `string` feature.
- `topic`: a `string` feature.
- `url`: a `string` feature.
- `title`: a `string` feature.
- `date`: a `string` feature.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
schema_guided_dialog
- `gem_id`: a `string` feature.
- `act`: a classification label, with possible values including `AFFIRM` (0), `AFFIRM_INTENT` (1), `CONFIRM` (2), `GOODBYE` (3), `INFORM` (4).
- `slot`: a `string` feature.
- `values`: a `list` of `string` features.
- `dialog_id`: a `string` feature.
- `turn_id`: an `int32` feature.
- `prompt`: a `string` feature.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
totto
- `gem_id`: a `string` feature.
- `totto_id`: an `int32` feature.
- `table_page_title`: a `string` feature.
- `table_webpage_url`: a `string` feature.
- `table_section_title`: a `string` feature.
- `table_section_text`: a `string` feature.
- `column_span`: an `int32` feature.
- `is_header`: a `bool` feature.
- `row_span`: an `int32` feature.
- `value`: a `string` feature.
- `highlighted_cells`: a `list` of `int32` features.
- `example_id`: a `string` feature.
- `original_sentence`: a `string` feature.
- `sentence_after_deletion`: a `string` feature.
- `sentence_after_ambiguity`: a `string` feature.
- `final_sentence`: a `string` feature.
- `overlap_subset`: a `string` feature.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
web_nlg_en
- `gem_id`: a `string` feature.
- `input`: a `list` of `string` features.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
- `category`: a `string` feature.
- `webnlg_id`: a `string` feature.
web_nlg_ru
- `gem_id`: a `string` feature.
- `input`: a `list` of `string` features.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
- `category`: a `string` feature.
- `webnlg_id`: a `string` feature.
wiki_auto_asset_turk
- `gem_id`: a `string` feature.
- `source_id`: a `string` feature.
- `target_id`: a `string` feature.
- `source`: a `string` feature.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
wiki_lingua_es_en
- `gem_id`: a `string` feature.
- `source`: a `string` feature.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
wiki_lingua_ru_en
- `gem_id`: a `string` feature.
- `source`: a `string` feature.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
wiki_lingua_tr_en
- `gem_id`: a `string` feature.
- `source`: a `string` feature.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
wiki_lingua_vi_en
- `gem_id`: a `string` feature.
- `source`: a `string` feature.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
xsum
- `gem_id`: a `string` feature.
- `xsum_id`: a `string` feature.
- `document`: a `string` feature.
- `target`: a `string` feature.
- `references`: a `list` of `string` features.
Data Splits
common_gen
| | train | validation | test |
---|---|---|---|
common_gen | 67389 | 993 | 1497 |
cs_restaurants
| | train | validation | test |
---|---|---|---|
cs_restaurants | 3569 | 781 | 842 |
dart
| | train | validation | test |
---|---|---|---|
dart | 62659 | 2768 | 6959 |
e2e_nlg
| | train | validation | test |
---|---|---|---|
e2e_nlg | 33525 | 4299 | 4693 |
mlsum_de
| | train | validation | test |
---|---|---|---|
mlsum_de | 220748 | 11392 | 10695 |
mlsum_es
| | train | validation | test |
---|---|---|---|
mlsum_es | 259886 | 9977 | 13365 |
schema_guided_dialog
| | train | validation | test |
---|---|---|---|
schema_guided_dialog | 164982 | 10000 | 10000 |
totto
| | train | validation | test |
---|---|---|---|
totto | 121153 | 7700 | 7700 |
web_nlg_en
| | train | validation | test |
---|---|---|---|
web_nlg_en | 35426 | 1667 | 1779 |
web_nlg_ru
| | train | validation | test |
---|---|---|---|
web_nlg_ru | 14630 | 790 | 1102 |
wiki_auto_asset_turk
| | train | validation | test_asset | test_turk |
---|---|---|---|---|
wiki_auto_asset_turk | 373801 | 73249 | 359 | 359 |
wiki_lingua_es_en
| | train | validation | test |
---|---|---|---|
wiki_lingua_es_en | 79515 | 8835 | 19797 |
wiki_lingua_ru_en
| | train | validation | test |
---|---|---|---|
wiki_lingua_ru_en | 36898 | 4100 | 9094 |
wiki_lingua_tr_en
| | train | validation | test |
---|---|---|---|
wiki_lingua_tr_en | 3193 | 355 | 808 |
wiki_lingua_vi_en
| | train | validation | test |
---|---|---|---|
wiki_lingua_vi_en | 9206 | 1023 | 2167 |
xsum
| | train | validation | test |
---|---|---|---|
xsum | 23206 | 1117 | 1166 |
Dataset Creation
Curation Rationale
Source Data
Initial Data Collection and Normalization
Who are the source language producers?
Annotations
Annotation process
Who are the annotators?
Personal and Sensitive Information
Considerations for Using the Data
Social Impact of Dataset
Discussion of Biases
Other Known Limitations
Additional Information
Dataset Curators
Licensing Information
CC-BY-SA-4.0
Citation Information
@article{gem_benchmark,
author = {Sebastian Gehrmann and
Tosin P. Adewumi and
Karmanya Aggarwal and
Pawan Sasanka Ammanamanchi and
Aremu Anuoluwapo and
Antoine Bosselut and
Khyathi Raghavi Chandu and
Miruna{-}Adriana Clinciu and
Dipanjan Das and
Kaustubh D. Dhole and
Wanyu Du and
Esin Durmus and
Ondrej Dusek and
Chris Emezue and
Varun Gangal and
Cristina Garbacea and
Tatsunori Hashimoto and
Yufang Hou and
Yacine Jernite and
Harsh Jhamtani and
Yangfeng Ji and
Shailza Jolly and
Dhruv Kumar and
Faisal Ladhak and
Aman Madaan and
Mounica Maddela and
Khyati Mahajan and
Saad Mahamood and
Bodhisattwa Prasad Majumder and
Pedro Henrique Martins and
Angelina McMillan{-}Major and
Simon Mille and
Emiel van Miltenburg and
Moin Nadeem and
Shashi Narayan and
Vitaly Nikolaev and
Rubungo Andre Niyongabo and
Salomey Osei and
Ankur P. Parikh and
Laura Perez{-}Beltrachini and
Niranjan Ramesh Rao and
Vikas Raunak and
Juan Diego Rodriguez and
Sashank Santhanam and
Jo{\~{a}}o Sedoc and
Thibault Sellam and
Samira Shaikh and
Anastasia Shimorina and
Marco Antonio Sobrevilla Cabezudo and
Hendrik Strobelt and
Nishant Subramani and
Wei Xu and
Diyi Yang and
Akhila Yerukola and
Jiawei Zhou},
title = {The {GEM} Benchmark: Natural Language Generation, its Evaluation and
Metrics},
journal = {CoRR},
volume = {abs/2102.01672},
year = {2021},
url = {https://arxiv.org/abs/2102.01672},
archivePrefix = {arXiv},
eprint = {2102.01672}
}
Contributions
Thanks to @yjernite for adding this dataset.
- Downloads last month
- 8,671