-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path0-A-create-databases.R
More file actions
118 lines (80 loc) · 4.22 KB
/
0-A-create-databases.R
File metadata and controls
118 lines (80 loc) · 4.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# ---------------------------------------------------------------------------------
# The script merge product/prices/stores databases and add variety information
# ---------------------------------------------------------------------------------
#### ----------- Common stuff --------------------
### PART 1: LOAD PRICE DATABASE
# Libraries needed
library(data.table)
library(dplyr)
library(readxl)
## load price database
dbase <- fread("C://Users/leandro/Dropbox/Docs/Investigacion/2016.Distance and quality/Bases/2020.Finales/2020-dbase-original.csv", data.table = F)
dbase <- fread("~/Dropbox/Docs/Investigacion/2016.Distance and quality/Bases/2020.Finales/2020-dbase-original.csv", data.table = F)
head(dbase)
# Database information --
# Super: store id number
# Product: product id number
# Year: year
# Month: month
# Time: year/month variable (1 being 2007/04, 90 being 2014/09)
# PMode: monthly mode of daily price
table(dbase$Product)
table(dbase$Year)
dbase$moda <- log(dbase$PMode) *100 # Add logs of prices and * 100
dbase$PMode <- NULL
#### Add information of Category ---------------------------
# Product information is on a separate file
products <- read_excel("~/Dropbox/Docs/Investigacion/2016.Distance and quality/Bases/2020.Finales/products.xls")
products <- read_excel("c://Users/leandro/Dropbox/Docs/Investigacion/2016.Distance and quality/Bases/2020.Finales/products.xls")
head(products)
categ <- distinct(products[, c(1,6,9,13)], id, Market, Used,`IPC-class`)
# Database information (products) --
# id: product id number identifier
# Category: the product category of the good
# Used: 1 if keep, 0 is skiped
# IPC-class: broad category to construct instrument
colnames(categ) <- c("Product","Category","Used","IPC.class")
dbase <- merge(dbase, categ, by= "Product")
head(dbase)
# Restrict database
dbase <- dbase[dbase$Used ==1,]
dbase$Used <- NULL
#### Count number of varieties ---------------------
# For each product, count the number of prices in the same category, less 1
dbase$Variety <- 0
dbase$Variety <- with(dbase, ave(Product, Category, Time, Super, FUN= length)) - 1
#### Add common supermarket information -------------
## Load database
supers <- read.csv("~/Dropbox/Docs/Investigacion/2016.Distance and quality/Bases/2020.Finales/Establecimientos.csv", header = T, sep = ',')
supers <- read.csv("C:/Users/leandro/Dropbox/Docs/Investigacion/2016.Distance and quality/Bases/2020.Finales/Establecimientos.csv") #windows
head(supers)
supers <- supers[, c("Super", "chain.number", "city.number", "depto.number", "X_UTM", "Y_UTM","ccz")] ## the information needed to merge
# Database information (products) --
# Super: store id number identifier
# chain.number: an id of the chain to which the store belongs
# city.number: the id of the city where the store is located
# X_UTM: the location in the easting UTM
# Y_UTM: the location in the northing UTM
# ccz: political location in Montevideo
supers <- supers[order(as.numeric(supers$Super)) ,] # order by supermarket
######## Supermarkets
## Check if all supermarkets are in price frame
supers.supers <- unique(supers$Super)
supers.dbase <- unique(dbase$Super)
'%!in%' <- function(x,y)!('%in%'(x,y)) ## Define function for those not in other vector
not.in.supers <- setdiff(supers.dbase, supers.supers) # supermarkets in price, but not in supers
not.in.prices <- setdiff(supers.supers, supers.dbase) # we do not have information for three supermarkets
rm(supers.dbase, supers.supers)
# Supermarket 386 is repeated, delete it
supers <- supers[supers$Super != 386,]
## Merge databases
# Exclude those supermarkets not in both bases. If want to include only supermarkets
# in price base, include all.x = TRUE
dbase <- merge(dbase, supers, by = "Super") # Merge cola and supermarket bases
rm(supers, not.in.prices, not.in.supers)
## Export databases
write.csv(dbase, "C:/Users/leandro/Dropbox/Docs/Investigacion/2016.Distance and quality/Bases/2020.Finales/2020-dbase-levels-final.csv", row.names = FALSE)
write.csv(dbase, "~/Dropbox/Docs/Investigacion/2016.Distance and quality/Bases/2020.Finales/2020-dbase-levels-final.csv", row.names = FALSE)
### Start next script ----------------
source("/path/to/file/0-B-difference-database-2020.R")
#### End of script ---------------------------------------