[
{
"id": 1,
"name": "Alpaca Data",
"type": "Conversational",
"huggingfacerepo": "tatsu-lab/alpaca",
"description": "Contains 52K instruction-following data used for fine-tuning the Alpaca model.",
"size": 24246638,
"license": "GPL"
},
{
"id": 2,
"name": "Simpsons Blip Captions",
"type": "Text-to-Image",
"huggingfacerepo": "Norod78/simpsons-blip-captions",
"description": "A dataset of 755 images from the Simpsons TV show with captions.",
"size": 50553165,
"license": "CC BY-NC 4.0"
},
{
"id": 3,
"name": "Databricks Dolly 15k",
"type": "Conversational",
"huggingfacerepo": "databricks/databricks-dolly-15k",
"description": "databricks-dolly-15k is an open source dataset of instruction-following records generated by thousands of Databricks employees in several of the behavioral categories outlined in the InstructGPT paper, including brainstorming, classification, closed QA, generation, information extraction, open QA, and summarization.",
"size": 13085339,
"license": "GPL"
},
{
"id": 4,
"name": "SAMSum Corpus",
"type": "Summarization",
"huggingfacerepo": "knkarthick/samsum",
"description": "The SAMSum dataset contains about 16k messenger-like conversations with summaries. Conversations were created and written down by linguists fluent in English. Linguists were asked to create conversations similar to those they write on a daily basis, reflecting the proportion of topics of their real-life messenger conversations. The style and register are diversified - conversations could be informal, semi-formal or formal, they may contain slang words, emoticons and typos. Then, the conversations were annotated with summaries. It was assumed that summaries should be a concise brief of what people talked about in the conversation in third person. The SAMSum dataset was prepared by Samsung R&D Institute Poland and is distributed for research purposes (non-commercial licence: CC BY-NC-ND 4.0).",
"size": 10281491,
"license": "CC BY-NC 4.0"
},
{
"id": 5,
"name": "PKU-SafeRLHF",
"type": "Text-Generation",
"huggingfacerepo": "PKU-Alignment/PKU-SafeRLHF",
"description": "The preference dataset consists of 30k+ expert comparison data. Each entry in this dataset includes two responses to a question, along with safety meta-labels and preferences for both responses, taking into consideration their helpfulness and harmlessness. The dataset is designed to help train models to generate safe and helpful responses.",
"size": 232675755,
"license": "CC BY-NC 4.0"
},
{
"id": 6,
"name": "Anthropic RLHF",
"type": "Text-Generation",
"huggingfacerepo": "Anthropic/hh-rlhf",
"description": "Human preference data about helpfulness and harmlessness from Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback.",
"size": 79254614,
"license": "GPL"
},
{
"id": 7,
"name": "Nectar",
"type": "Conversational",
"huggingfacerepo": "berkeley-nest/Nectar",
"description": "Nectar is the first high-quality 7-wise comparison dataset, generated through GPT-4-based ranking. Nectar contains diverse chat prompts, high-quality and diverse responses, and accurate ranking labels. Nectar's prompts are an amalgamation of diverse sources, including lmsys-chat-1M, ShareGPT, Anthropic/hh-rlhf, UltraFeedback, Evol-Instruct, and Flan.",
"size": 517351596,
"license": "APACHE-2.0"
},
{
"id": 8,
"name": "OpenHermes",
"type": "Text-Generation",
"huggingfacerepo": "teknium/openhermes",
"description": "The OpenHermes dataset is composed of 242,000 entries of primarily GPT-4 generated data.",
"size": 328000487,
"license": "GPL"
},
{
"id": 9,
"name": "MetaMathQA",
"type": "Conversational",
"huggingfacerepo": "meta-math/MetaMathQA",
"description": "MetaMath-Mistral-7B is fully fine-tuned on the MetaMathQA datasets and based on the powerful Mistral-7B model. Using the MetaMathQA datasets and changing the base model from llama-2-7B to Mistral-7B can boost the GSM8K performance from 66.5 to 77.7.",
"size": 395626321,
"license": "MIT"
},
{
"id": 10,
"name": "OpenWebText 10k",
"type": "Text-Generation",
"huggingfacerepo": "stas/openwebtext-10k",
"description": "10K slice of OpenWebText - An open-source replication of the WebText dataset from OpenAI.",
"size": 14723792,
"license": "GPL"
},
{
"id": 11,
"name": "INTELLECT-2 RL Dataset",
"type": "Conversational",
"huggingfacerepo": "PrimeIntellect/INTELLECT-2-RL-Dataset",
"description": "Math and Coding tasks used for training the INTELLECT-2 model.",
"size": 1730616738,
"license": "GPL"
},
{
"id": 12,
"name": "Agentica DeepCoder Dataset",
"type": "Conversational",
"huggingfacerepo": "agentica-org/DeepCoder-Preview-Dataset",
"description": "7.5K TACO Verified problems for training and evaluation of the Agentica DeepCoder model.",
"size": 862295065,
"license": "GPL",
"dataset_config": "taco"
},
{
"id": 13,
"name": "Wikipedia (20231101.en)",
"type": "Text-Generation",
"huggingfacerepo": "wikimedia/wikipedia",
"description": "Wikipedia dataset containing cleaned articles of all languages. The datasets are built from the Wikipedia dump (https://dumps.wikimedia.org/) with one split per language. Each example contains the content of one full Wikipedia article with cleaning to strip markdown and unwanted sections (references, etc.).",
"size": 11630929031,
"license": "GPL",
"dataset_config": "20231101.en"
},
{
"id": 14,
"name": "Touch Rugby Rules",
"type": "Text-Generation",
"huggingfacerepo": "Trelis/touch-rugby-rules",
"description": "Touch Rugby Rules Dataset",
"size": 104539,
"license": "CC BY-NC 4.0"
},
{
"id": 15,
"name": "Text to Image Dataset Sample",
"type": "Text-to-Image",
"huggingfacerepo": "datasets-examples/doc-image-6",
"description": "This dataset contains 4 jpeg files in the train/images/ subdirectory, along with a train/metadata.csv file that provides the data for other columns. The metadata file contains relative paths to the images.",
"size": 227381,
"license": "GPL"
}
]