# lta/AlchemistCoder.yaml at preview · datalets/lta · GitHub
---
# Thank you for contributing!
# In filling out this yaml file, please follow the criteria as described here:
# https://osai-index.eu/contribute

# You're free to build on this work and reuse the data. It is licensed under CC-BY 4.0, with the
# stipulation that attribution should come in the form of a link to https://osai-index.eu/
# and a citation to the peer-reviewed paper in which the dataset & criteria were published:

# Liesenfeld, A. and Dingemanse, M., 2024. Rethinking open source generative AI: open-washing and the EU AI Act. In Proceedings of the 2024 ACM Conference on Fairness, Accountability, and Transparency (pp. 1774-1787).

# Organization tags:
# - National origin: China
# - Contributor type: Academic (Research institution)

# Identity of the assessed system: model page link, base and end model names,
# end-model license, and release date.
system:
    name: AlchemistCoder
    link: https://huggingface.co/internlm/AlchemistCoder-DS-6.7B
    type: code
    performanceclass: full
    basemodelname: DeepSeek-Coder-6.7B-Base
    endmodelname: AlchemistCoder-DS-6.7B
    endmodellicense: Apache-2.0
    # Quoted so the date-like value is always loaded as a string.
    releasedate: "2024-05"
    notes: >-
        Open model trained by harmonizing different data sources.
        Multiple versions exist with different base models.

# Organization credited with the system, plus its website and a short description.
org:
    name: Shanghai AI Laboratory
    link: https://www.shlab.org.cn/
    notes: National-level Chinese research institute.

# availability:
# Rating for the base model's training data, with the arXiv document cited as evidence.
datasources_basemodel:
    class: closed
    link: https://arxiv.org/pdf/2401.14196
    notes: >-
        GitHub is mentioned as a primary source for code data.
        For the rest the data mixture is left abstract.

# Rating for the end model's training data, with the arXiv document cited as evidence.
datasources_endmodel:
    class: partial
    link: https://arxiv.org/pdf/2405.19265
    notes: >-
        The model makes use of both regular open-source data and synthetic data.
        Though the open-source data is outlined in the paper, the synthetic data
        generated is not provided.

# Rating for the base model's weights; the link is the base model's Hugging Face repository.
weights_basemodel:
    class: open
    link: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
    notes: Weights available through HuggingFace.

# Rating for the end model's weights; the link is the same URL as system.link.
weights_endmodel:
    class: open
    link: https://huggingface.co/internlm/AlchemistCoder-DS-6.7B
    notes: Weights available through HuggingFace.

# Rating for the training code; the linked GitHub repository is the evidence
# discussed in the notes.
trainingcode:
    class: closed
    link: https://github.com/InternLM/AlchemistCoder/
    notes: >-
        A repository exists which purportedly contains source code.
        However, this repository contains no code.

# documentation:
# Rating for code documentation; uses the same repository link as the
# trainingcode entry.
code:
    class: closed
    link: https://github.com/InternLM/AlchemistCoder/
    notes: No code available.

# Rating for hardware architecture documentation. No source is cited, so the
# link is written as an explicit null rather than a bare empty value.
hardware_architecture:
    class: closed
    link: null
    notes: No hardware architecture outlined.

# Rating for the preprint; the link is the same arXiv URL as
# datasources_endmodel.link.
preprint:
    class: open
    link: https://arxiv.org/pdf/2405.19265
    notes: Preprint made available on arXiv.

# Rating for the peer-reviewed paper; the link is the ACM Digital Library
# record for the publication.
paper:
    class: open
    link: https://dl.acm.org/doi/abs/10.5555/3737916.3737987
    # The conference's current acronym is NeurIPS (NIPS is the retired form).
    notes: Paper published at NeurIPS.

# Rating for the model card; the link is the same URL as system.link.
# NOTE(review): the class is `closed`, yet the notes say the card contains some
# information — confirm against the contribution criteria whether `partial`
# was intended.
modelcard:
    class: closed
    link: https://huggingface.co/internlm/AlchemistCoder-DS-6.7B
    notes: Model card contains some information, mainly describing the model and providing usage instructions.

# Rating for data sheets; the link is a list of three Hugging Face dataset pages.
datasheet:
    class: partial
    link:
        - "https://huggingface.co/datasets/nickrosh/Evol-Instruct-Code-80k-v1"
        - "https://huggingface.co/datasets/codefuse-ai/CodeExercise-Python-27k"
        - "https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1"
    notes: >-
        Data sheets available for some data sources, however synthetic data is
        not made publicly available.

# access:
# Rating for a software package. No source is cited, so the link is written as
# an explicit null rather than a bare empty value.
package:
    class: closed
    link: null
    notes: No package found.

# Rating for API access, with a separate rating for the metaprompt. No source is
# cited, so the link is written as an explicit null rather than a bare empty value.
api:
    class: closed
    link: null
    notes: No API found.
    metaprompt: closed

# Rating for licensing; the link targets an apache-2.0.md file hosted on
# Hugging Face.
licenses:
    class: open
    link: https://huggingface.co/datasets/choosealicense/licenses/blob/main/markdown/apache-2.0.md
    notes: Model licensed under Apache-2.0.