-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathstratified_random_sampling.py
More file actions
executable file
·59 lines (52 loc) · 1.58 KB
/
stratified_random_sampling.py
File metadata and controls
executable file
·59 lines (52 loc) · 1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# -*- coding: UTF-8 -*-
# create 10000 sample tracks according to the genre proportion
from __future__ import division
import pymysql
import random
# Connection settings, unpacked as keyword arguments into pymysql.connect().
# NOTE(review): credentials are hard-coded here -- consider loading them from
# the environment or a config file kept out of version control.
config = dict(
    host='localhost',
    user='root',
    password='960513',
    database='recommenderSystem',
)
# stratified random sample
def stratified_random_sampling(output_sample_num):
    """Store a genre-stratified random sample of ``sub_tracks`` in ``sample_tracks``.

    Every row of ``sub_tracks`` is read as ``(trackID, majorityGenre)``, the
    tracks are grouped by genre, and each genre contributes a share of the
    sample proportional to its share of all tracks.  The chosen rows are then
    copied into a newly created ``sample_tracks`` table.

    Args:
        output_sample_num: Target size of the sample.  Per-genre quotas are
            rounded independently, so the number of tracks actually drawn
            (printed before the insert) can differ slightly from this value.

    Returns:
        None.  Progress and any database error are printed to stdout.
    """
    db = pymysql.connect(**config)
    try:
        cur = db.cursor()

        # Load every (trackID, majorityGenre) pair.
        try:
            cur.execute('use recommenderSystem')
            cur.execute("select trackID, majorityGenre from sub_tracks")
            songs = cur.fetchall()
        except Exception as e:
            print(e)
            # Without the track list there is nothing to sample; stop here
            # instead of going on to create an empty sample table.
            return
        print("songs load finished")
        print(len(songs))

        # Group track ids by genre.
        genre_songs = {}
        for track_id, genre in songs:
            genre_songs.setdefault(genre, []).append(track_id)

        # Calculate each genre's quota from its proportion of all tracks.
        song_count = len(songs)
        quotas = {}
        sample_count = 0
        for genre, tracks in genre_songs.items():
            # int() because round() returns a float on Python 2, which
            # random.sample() does not accept as a sample size.
            quota = int(round(len(tracks) / song_count * output_sample_num))
            # Clamp to what the genre can supply so random.sample() cannot
            # raise when the requested total exceeds the available tracks.
            quota = max(0, min(quota, len(tracks)))
            quotas[genre] = quota
            sample_count += quota
        print(quotas)
        print(sample_count)

        samples = []
        for genre, tracks in genre_songs.items():
            samples.extend(random.sample(tracks, quotas[genre]))

        if not samples:
            # An empty id list would be rendered as "in ()", which is not
            # valid SQL, so skip the store step entirely.
            print("no tracks sampled; sample_tracks not created")
            return

        # Store the sample tracks into the database.
        try:
            cur.execute("create table sample_tracks like sub_tracks")
            cur.execute(
                "insert into sample_tracks select * from sub_tracks where trackID in %s",
                [samples],
            )
            cur.execute("alter table sample_tracks add primary key (trackID)")
            db.commit()
        except Exception as e:
            print(e)
    finally:
        # Always release the connection, including on early return or error.
        db.close()
# Script entry: build the sample table with a target of 10000 tracks.
stratified_random_sampling(output_sample_num=10000)