bibliometrics/combined_repatha_fewpatents.sql at master · NETESOLUTIONS/bibliometrics · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
-- batch file for repatha test case some stages procedures are quite manual
-- George Chacko 4/14/2016
-- modified on 4/30/2016 to include only primary patents (denoted by fp for few patents)

--PATENT BRANCH
-- Find relevant patent numbers by the method of Google
-- then search for these number in uspto_patents and Derwent tables
-- select patent_num from uspto_patents where patent_num like '%8871913' or patent_num like '%8871914';
-- returns
-- patent_num
-- ------------
--  08871913
--  08871914
-- search aaain with correct numbers for cited patents and cited literature
-- run this script as 'psql -d pardi < combined_repatha.sql > repathalog;'

\echo create patents awarded to drug***
drop table if exists fp_temp_repatha_patents_g1;
create table fp_temp_repatha_patents_g1 (drug varchar, patent_num_orig varchar);
insert into fp_temp_repatha_patents_g1 values ('Repatha','08871913');
insert into fp_temp_repatha_patents_g1 values ('Repatha','08871914');

\echo retrieve wosids for patents***
drop table if exists fp_temp_repatha_patents_wos;
create table fp_temp_repatha_patents_wos as select * from wos_patent_mapping
where patent_orig in (select patent_num_orig from fp_temp_repatha_patents_g1);

\echo joining on wos_pmid_mapping to get pmids **
drop table if exists fp_temp_repatha_patents_pmid;
create table fp_temp_repatha_patents_pmid as
select a.patent_num,a.wos_id,b.pmid_int from fp_temp_repatha_patents_wos a
LEFT JOIN wos_pmid_mapping b on a.wos_id=b.wos_uid;

-- LITERATURE BRANCH
-- begin with drug or biologic name, e.g. repatha aka evolocumab aka AMG145
-- assemble seedset of pmids by scraping from FDA approval documents and identifying corresponding pmids
-- use R script is seedset.R to get back structured data from eutuils, the source file is ev_fda_foundational,
-- the output is ev_fda_seedset

-- load seedset of 59 pmids from R environment exported as repatha_seedset.csv

\echo loading seedset pmids ***
drop table if exists fp_temp_repatha1;
create table fp_temp_repatha1 (sno int, uid int, pubdate varchar, lastauthor varchar, source varchar, title varchar, year int);
copy fp_temp_repatha1 from '/tmp/seedset.csv' DELIMITER ',' CSV HEADER;

-- create CT to pmid table (temp_repatha_ct)
\echo search for repatha in CT tables in PARDI and identify pmids from cited references in ct_references***
drop table if exists fp_temp_repatha_ct;
create table fp_temp_repatha_ct as select nct_id,pmid from ct_references where nct_id in (select nct_id from ct_interventions
where lower(intervention_name) ='evolocumab' or lower(intervention_name) like 'repatha');

--create first generation pmid list (citg1)
\echo creating first generation pmid list***
drop table if exists fp_temp_repatha_citg1;
create table fp_temp_repatha_citg1 (source varchar, citg1 int);
insert into fp_temp_repatha_citg1 select patent_num, pmid_int  from fp_temp_repatha_patents_pmid;
insert into fp_temp_repatha_citg1 select nct_id,pmid from fp_temp_repatha_ct;
insert into fp_temp_repatha_citg1 select 'seedset',uid from fp_temp_repatha1 where uid not in (select pmid from fp_temp_repatha_ct);
--map citg1 to SPIRES for

drop table if exists fp_temp_repatha_citg1_spires;
create table fp_temp_repatha_citg1_spires as select a.*,b.full_project_num_dc,b.admin_phs_org_code,b.match_case,
b.external_org_id,b.index_name from fp_temp_repatha_citg1 a LEFT JOIN spires_pub_projects b on a.citg1=b.pmid;

\echo creating second generation pmid list***

--get wos ids
drop table if exists fp_temp_repatha2;
create table fp_temp_repatha2 as select a.*,b.wos_uid from fp_temp_repatha_citg1 a LEFT JOIN wos_pmid_mapping b on a.citg1=b.pmid_int;
--- in this case wos_uids were not returned for 6 pmids so the table was manually edited to include them based on
--a GUI search
\echo adding manually discovered wos_uids by GUI search***
update fp_temp_repatha2 set wos_uid  = 'WOS:000084376400009'  where citg1  = 10712828;
update fp_temp_repatha2 set wos_uid  = 'WOS:000338999800004'  where citg1  = 25014686;
update fp_temp_repatha2 set wos_uid  = 'WOS:000367007300004'  where citg1  = 26696675;
update fp_temp_repatha2 set wos_uid  = 'WOS:000306270800036'  where citg1  = 22085343;
update fp_temp_repatha2 set wos_uid  = 'WOS:000277311200030'  where citg1  = 20228404;
update fp_temp_repatha2 set wos_uid  = 'WOS:000284451000032'  where citg1  = 21067804;
-- get cited references
set enable_seqscan to 'off';
drop table if exists fp_temp_repatha3;
create table fp_temp_repatha3 as select a.*,b.cited_source_uid from fp_temp_repatha2 a
LEFT JOIN wos_references b on a.wos_uid=b.source_id;

\echo cleaning wos ids***
-- clean WOS IDs.
\echo Cleaning WOS IDs on temp_repatha3...***
update fp_temp_repatha3
  set cited_source_uid =
  (
    case when cited_source_uid like 'WOS%'
           then substring(cited_source_uid, 1, 19)
         when cited_source_uid like 'MED%' or cited_source_uid like 'NON%' or
         cited_source_uid like 'CSC%' or cited_source_uid like 'INS%' or
         cited_source_uid like 'BCI%' or cited_source_uid=''
           then cited_source_uid
         else substring('WOS:'||cited_source_uid, 1, 19)
    end
  );

copy (select * from fp_temp_repatha3) to '/tmp/tr3.csv' DELIMITER ',' CSV HEADER;

\echo mapping back to pmids***
drop table if exists fp_temp_repatha4;
create table fp_temp_repatha4 as  select a.citg1 as input_pmid,a.wos_uid,a.cited_source_uid,b.pmid,b.pmid_int
as pmid_output from fp_temp_repatha3 a LEFT JOIN wos_pmid_mapping b on a.cited_source_uid=b.wos_uid;

-- pick up the extra MEDLINE ones as well
update fp_temp_repatha4 set pmid=cited_source_uid where substring(cited_source_uid,1,8)='MEDLINE:';
update fp_temp_repatha4 set pmid_output=substring(pmid,9)::int  where substring(cited_source_uid,1,8)='MEDLINE:';

-- get SPIRES data for Gen2
\echo mapping pmid output to SPIRES for grants data***
drop table if exists fp_temp_repatha_citg2_spires;
create table fp_temp_repatha_citg2_spires as select a.*,b.full_project_num_dc,b.admin_phs_org_code,b.match_case,
b.external_org_id,b.index_name from fp_temp_repatha4 a
LEFT JOIN spires_pub_projects b on a.pmid_output=b.pmid;

\echo export all relevant tables to /tmp***
copy(select * from fp_temp_repatha_patents_pmid) to '/tmp/fp_temp_repatha_patents_pmid.csv' DELIMITER ',' CSV HEADER;
copy(select * from fp_temp_repatha_citg1_spires) to '/tmp/fp_temp_repatha_citg1_spires.csv' DELIMITER ',' CSV HEADER;
copy(select * from fp_temp_repatha_citg2_spires) to '/tmp/fp_temp_repatha_citg2_spires.csv' DELIMITER ',' CSV HEADER;