-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload-sentences.php
More file actions
executable file
·125 lines (108 loc) · 2.54 KB
/
load-sentences.php
File metadata and controls
executable file
·125 lines (108 loc) · 2.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env php
<?php
declare(strict_types=1);
require_once __DIR__.'/vendor/autoload.php';
// Raise the memory ceiling; the in-memory $uniq map below gains one entry per distinct sentence.
ini_set('memory_limit', '24G');

// Target database: first CLI argument, or 'sentences.sqlite' when none is given.
$db_file = $argv[1] ?? 'sentences.sqlite';
$db = new \TDC\PDO\SQLite($db_file);

// Database tuning statements. They are executed strictly in the listed order:
// the rollback-journal mode and page size are set first, followed by VACUUM,
// and only then the remaining bulk-load settings (journal in memory, exclusive
// locking, synchronous writes off, constraint/foreign-key checks off).
$setup_statements = [
    "PRAGMA journal_mode = delete",
    "PRAGMA page_size = 65536",
    "VACUUM",
    "PRAGMA auto_vacuum = INCREMENTAL",
    "PRAGMA case_sensitive_like = ON",
    "PRAGMA foreign_keys = OFF",
    "PRAGMA ignore_check_constraints = ON",
    "PRAGMA journal_mode = MEMORY",
    "PRAGMA locking_mode = EXCLUSIVE",
    "PRAGMA synchronous = OFF",
    "PRAGMA threads = 4",
    "PRAGMA trusted_schema = OFF",
];
foreach ($setup_statements as $statement) {
    $db->exec($statement);
}

// One row per unique sentence: numeric id, token count, and the normalised text.
$db->exec("CREATE TABLE sents (
s_id INTEGER NOT NULL,
s_tokens INTEGER NOT NULL,
s_text TEXT NOT NULL,
PRIMARY KEY (s_id)
) WITHOUT ROWID");

// Inserts are batched inside explicit transactions; save_sent() commits periodically.
$db->beginTransaction();
$ins = $db->prepare("INSERT INTO sents (s_id, s_tokens, s_text) VALUES (?, ?, ?)");

// Shared parser state, accessed as globals by handle_cohort() and save_sent().
$uniq = [];          // sha1(sentence text) => assigned sentence id
$in_par = false;     // id of the currently open <sN> element, or false when outside one
$sent = $cohort = ''; // sentence text built so far / raw lines of the pending cohort
$tokens = $i = 0;    // token counter for the current sentence / last assigned sentence id
/**
 * Flush the pending cohort (global $cohort) into the sentence being built.
 *
 * The wordform is taken from the leading `"<...>"` of the cohort text. It is
 * lower-cased unless the cohort text contains ' Prop' or ' ?', then appended
 * to global $sent followed by a single space, and global $tokens is
 * incremented. A cohort whose text does not start with `"<...>"` is reported
 * on stdout as "BAD COHORT" and discarded. In every case $cohort ends up
 * empty. Calling this with no pending cohort is a no-op.
 */
function handle_cohort() {
    global $tokens, $sent, $cohort;

    // Nothing pending: just normalise the buffer and leave.
    if (empty($cohort)) {
        $cohort = '';
        return;
    }

    $matched = preg_match('~^"<(.+?)>"~u', $cohort, $parts);
    if (!$matched) {
        echo "BAD COHORT: $cohort\n";
        $cohort = '';
        return;
    }

    // Casing is preserved only when one of these markers occurs anywhere in the cohort text.
    $keep_case = strpos($cohort, ' Prop') !== false || strpos($cohort, ' ?') !== false;
    $wordform = $keep_case ? $parts[1] : mb_strtolower($parts[1]);

    $sent .= $wordform.' ';
    ++$tokens;
    $cohort = '';
}
/**
 * Finish the sentence currently being built and store it if it is new.
 *
 * Flushes the pending cohort via handle_cohort(), trims the accumulated text
 * in global $sent and, when the text is non-empty and its SHA-1 has not been
 * seen before, assigns the next id (global $i), records it in global $uniq
 * and inserts (id, token count, text) through the prepared statement $ins.
 * After every 10000 stored sentences the id is echoed as progress and the
 * open transaction is committed and a new one started.
 *
 * The per-sentence state ($sent, $tokens) is always reset on return, also
 * when nothing was stored, so leftovers never leak into the next sentence.
 */
function save_sent() {
    global $db, $ins, $uniq, $tokens, $sent, $i;

    handle_cohort();

    $sent = trim($sent);

    // Compare with '' explicitly: empty() also considers the string "0" empty,
    // which would silently drop a sentence consisting only of the token "0".
    if ($sent !== '') {
        $hash = sha1($sent);
        if (!isset($uniq[$hash])) {
            $uniq[$hash] = ++$i;
            $ins->execute([$i, $tokens, $sent]);

            // Periodic progress output and commit to keep transactions bounded.
            if ($i % 10000 === 0) {
                echo "$i\r";
                $db->commit();
                $db->beginTransaction();
            }
        }
    }

    $sent = '';
    $tokens = 0;
}
// Main loop: read the input stream from STDIN line by line.
//
// Recognised line types:
//   <sN>            opens sentence element N
//   </sN>           closes it; the id must match the open one
//   "<...           starts a new cohort (wordform line)
//   whitespace + "  continuation line belonging to the current cohort
//   anything else   inside an element: treated as a sentence break
// Lines outside any <sN>...</sN> element are ignored.
//
// Compare against false explicitly so a falsy line (e.g. a final "0" without
// a trailing newline) does not end the loop prematurely.
while (($line = fgets(STDIN)) !== false) {
    if (preg_match('~^<s(\d+)>~', $line, $m)) {
        $in_par = intval($m[1]);
    }
    else if (preg_match('~^</s(\d+)>~', $line, $m)) {
        if (intval($m[1]) !== $in_par) {
            echo "MISMATCH: {$m[1]} != {$in_par}\n";
            // Discard everything collected for the broken element, including
            // the pending cohort, so it cannot be prepended to the next sentence.
            $in_par = false;
            $tokens = 0;
            $sent = '';
            $cohort = '';
            continue;
        }
        save_sent();
        $in_par = false;
        $tokens = 0;
        $sent = '';
    }
    // Strict check: sentence id 0 is a valid open element, only false means "outside".
    else if ($in_par !== false) {
        if (preg_match('~^"<~u', $line)) {
            // Flush the previous cohort, then start collecting the new one.
            // Tokens are counted by handle_cohort() when a cohort is flushed,
            // so no additional increment is done here (it would double-count).
            handle_cohort();
            $cohort = $line;
        }
        else if (preg_match('~^\s+"~u', $line)) {
            $cohort .= $line;
        }
        else {
            // Empty line means sentence break
            save_sent();
        }
    }
}

// Flush whatever is still pending at end of input, then finalise.
save_sent();
echo "$i\n";
$db->commit();

// Restore the settings that were relaxed for the bulk load.
$db->exec("PRAGMA ignore_check_constraints = OFF");
$db->exec("PRAGMA locking_mode = NORMAL");