Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
238 changes: 119 additions & 119 deletions lib/google-refine.rb → lib/refine.rb
Original file line number Diff line number Diff line change
@@ -1,120 +1,120 @@
require 'httpclient'
require 'cgi'
require 'json'

class Refine
attr_reader :project_name
attr_reader :project_id

# Fetches the metadata of every project from a Refine server.
#
# @param server [String] base URL of the Refine server
# @return [Object] the parsed JSON body of the server's reply
def self.get_all_project_metadata(server="http://127.0.0.1:3333")
  endpoint = "#{server}/command/core/get-all-project-metadata"
  # NOTE(review): the server URL is also what gets passed to HTTPClient.new;
  # that argument looks like it is a proxy address in httpclient -- confirm.
  http = HTTPClient.new(server)
  reply = http.get(endpoint)
  JSON.parse(reply.body)
end

# Binds the client to a project, creating it first when a file is supplied.
#
# Recognised string keys in +opts+:
# * "server"            - base URL, defaults to "http://127.0.0.1:3333"
# * "throws_exceptions" - whether #call raises on an API error reply,
#                         defaults to true
# * "file_name" and "project_name" - when both are non-empty, a new project
#                         is created from the file
# * "project_id"        - id of an existing project, used otherwise
#
# @param opts [Hash] options as listed above
def initialize(opts = {})
  @server = opts["server"] || "http://127.0.0.1:3333"

  # `opts[...] || true` is always true, so an explicit false was ignored.
  # Only fall back to the default when no value was given.
  throws = opts["throws_exceptions"]
  @throws_exceptions = throws.nil? ? true : throws

  file_name = opts["file_name"]
  requested_name = opts["project_name"]

  if file_name && !file_name.empty? && requested_name && !requested_name.empty?
    # The project name is stored in its CGI-escaped form.
    project_name = CGI.escape(requested_name)
    @project_id = create_project(project_name, file_name)
    @project_name = project_name if @project_id
  else
    @project_id = opts["project_id"]

    # get_project_metadata is not defined explicitly; it is dispatched
    # through method_missing to the "get-project-metadata" command.
    metadata = self.get_project_metadata
    @project_name = CGI.escape(metadata["name"])
  end
end

def create_project(project_name, file_name)
uri = @server + "/command/core/create-project-from-upload"
project_id = false
File.open(file_name) do |file|
body = {
'project-file' => file,
'project-name' => project_name
}
response = client.post(uri, body)
url = response.header['Location']
unless url == []
project_id = CGI.parse(url[0].split('?')[1])['project'][0]
end
end
raise "Error creating project: #{response}" unless project_id
project_id
end

# Applies a set of operations to the project.
#
# @param file_name_or_string [String] either the path of an existing file
#   holding the operations, or the operations themselves as a string
# @return [Object] whatever #call returns for the "apply-operations" command
def apply_operations(file_name_or_string)
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  operations =
    if File.exist?(file_name_or_string)
      File.read(file_name_or_string)
    else
      file_name_or_string
    end

  # Send the resolved operations, not the raw argument (which may be a path).
  call('apply-operations', 'operations' => operations)
end

# Exports the rows of the project.
#
# Recognised string keys in +opts+:
# * "format"  - export format, defaults to 'tsv'
# * "facets"  - facet list placed in the engine configuration, defaults to []
# * "options" - exporter options, either an already encoded String or a Hash
#
# The raw HTTP response is kept in @response.
#
# @param opts [Hash] options as listed above
# @return [Object] the content of the server's reply
def export_rows(opts={})
  format = opts["format"] || 'tsv'
  uri = "#{@server}/command/core/export-rows/#{@project_name}.#{format}"

  # A Hash is encoded as JSON here, like the engine configuration below.
  # Strings are passed through untouched so existing callers keep working.
  # NOTE(review): presumably the server expects JSON for this field -- confirm.
  options = opts["options"] || ''
  options = options.to_json if options.is_a?(Hash)

  engine = {
    "facets" => opts["facets"] || [],
    "mode" => "row-based"
  }

  body = {
    'engine' => engine.to_json,
    'options' => options,
    'project' => @project_id,
    'format' => format
  }

  @response = client.post(uri, body)
  @response.content
end

# Asks the server to delete the current project.
#
# The raw HTTP response is kept in @response.
#
# @return [Object, false] the "code" entry of the JSON reply, or false when
#   the reply cannot be parsed or indexed that way
def delete_project
  endpoint = @server + '/command/core/delete-project'
  @response = client.post(endpoint, { 'project' => @project_id })

  begin
    JSON.parse(@response.content)['code']
  rescue StandardError
    # Any failure while reading the reply is reported as false.
    false
  end
end

# this pattern is pulled from mailchimp/mailchimp-gem

# Sends one command to the server and returns the decoded reply.
#
# Commands whose name starts with 'get-' are sent with GET, all others with
# POST. The current project id is sent as "project" unless +params+
# overrides it.
#
# @param method [String] command name, e.g. 'get-operations'
# @param params [Hash] additional request parameters
# @return [Object] the parsed JSON reply
# @raise [RuntimeError] when exceptions are enabled and the reply is a Hash
#   whose "code" is "error"
def call(method, params = {})
  endpoint = "#{@server}/command/core/#{method}"
  payload = { "project" => @project_id }.merge(params)

  http_verb = method.start_with?('get-') ? :get : :post
  raw = client.public_send(http_verb, endpoint, payload)

  parsed =
    begin
      JSON.parse(raw.body)
    rescue StandardError
      # Second attempt: wrap the body in an array and take its first element.
      JSON.parse('[' + raw.body + ']').first
    end

  api_error = parsed.is_a?(Hash) && parsed["code"] == "error"
  raise "API Error: #{parsed}" if @throws_exceptions && api_error

  parsed
end

# Forwards any otherwise undefined method to #call as a server command.
#
# Underscores in the method name become dashes, e.g.
# get_column_info --> get-column-info. All arguments are passed on unchanged.
#
# @param method [Symbol, String] name of the method that was invoked
# @param args [Array] arguments forwarded to #call
# @return [Object] whatever #call returns
def method_missing(method, *args)
  command = method.to_s.tr('_', '-')
  call(command, *args)
end

protected
# Returns the HTTP client, building and caching it in @client on first use.
#
# @return [HTTPClient] the cached client instance
def client
  return @client if @client

  @client = HTTPClient.new(@server)
end
require 'httpclient'
require 'cgi'
require 'json'
class Refine
attr_reader :project_name
attr_reader :project_id
def self.get_all_project_metadata(server="http://127.0.0.1:3333")
uri = "#{server}/command/core/get-all-project-metadata"
response = HTTPClient.new(server).get(uri)
JSON.parse(response.body)
end
def initialize(opts = {})
@server = opts["server"] || "http://127.0.0.1:3333"
@throws_exceptions = opts["throws_exceptions"] || true
if opts["file_name"] && !opts["file_name"].empty? && opts["project_name"] && !opts["project_name"].empty?
project_name = CGI.escape(opts["project_name"])
@project_id = create_project(project_name, opts["file_name"])
@project_name = project_name if @project_id
else
@project_id = opts["project_id"]
metadata = self.get_project_metadata
@project_name = CGI.escape(metadata["name"])
end
end
def create_project(project_name, file_name)
uri = @server + "/command/core/create-project-from-upload"
project_id = false
File.open(file_name) do |file|
body = {
'project-file' => file,
'project-name' => project_name
}
response = client.post(uri, body)
url = response.header['Location']
unless url == []
project_id = CGI.parse(url[0].split('?')[1])['project'][0]
end
end
raise "Error creating project: #{response}" unless project_id
project_id
end
def apply_operations(file_name_or_string)
if File.exists?(file_name_or_string)
operations = File.read(file_name_or_string)
else
operations = file_name_or_string
end
call('apply-operations', 'operations' => operations)
end
def export_rows(opts={})
format = opts["format"] || 'tsv'
uri = @server + "/command/core/export-rows/#{@project_name}.#{format}"
body = {
'engine' => {
"facets" => opts["facets"] || [],
"mode" => "row-based"
}.to_json,
'options' => opts["options"] || '',
'project' => @project_id,
'format' => format
}
@response = client.post(uri, body)
@response.content
end
def delete_project
uri = @server + "/command/core/delete-project"
body = {
'project' => @project_id
}
@response = client.post(uri, body)
JSON.parse(@response.content)['code'] rescue false
end
# this pattern is pulled from mailchimp/mailchimp-gem
def call(method, params = {})
uri = "#{@server}/command/core/#{method}"
params = { "project" => @project_id }.merge(params)
response = if method.start_with?('get-')
client.get(uri, params)
else
client.post(uri, params)
end
begin
response = JSON.parse(response.body)
rescue
response = JSON.parse('[' + response.body + ']').first
end
if @throws_exceptions && response.is_a?(Hash) && response["code"] && response["code"] == "error"
raise "API Error: #{response}"
end
response
end
def method_missing(method, *args)
# translate: get_column_info --> get-column-info
call(method.to_s.gsub('_', '-'), *args)
end
protected
def client
@client ||= HTTPClient.new(@server)
end
end
4 changes: 4 additions & 0 deletions test/dates.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Date
7 December 2001
July 1 2002
10/20/10
109 changes: 109 additions & 0 deletions test/examples_of_usage.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
### NOTE: The internal client-server protocol used by OpenRefine is not yet maintained as a stable external API and is subject to change. ###
### Therefore, please report any changes you notice to kittelmann@sub.uni-goettingen.de ###
### Some examples require cURL http://curl.haxx.se ###
### It is assumed that the examples are run from the 'test' directory. Otherwise paths need to be adjusted.
load '../lib/refine.rb'

##########################
### create initial project
##########################
prj = Refine.new({ 'project_name' => 'date_cleanup', 'file_name' => 'dates.csv' })


##########################
### create another project
##########################
prj.create_project( 'date_cleanup', 'dates.txt' ) # return value = project id, example: 1484090391100


################
### do something
################
prj.apply_operations( 'operations.json' ) # return value = status code, example: {'code'=>'ok'}


######################
### extract operations
######################
prj.get_operations # return value = operations as Hash

######################################
### save extracted operations to file:
######################################
extracted_operations = prj.get_operations
File.open('../test/extracted_operations.json', 'w') do |f|
  # Serialize explicitly: writing the parsed object directly would store its
  # Ruby #to_s form, which is not valid JSON.
  f.write JSON.pretty_generate(extracted_operations)
end


###############
### export data
###############
prj.export_rows # return value = exported data as tsv
prj.export_rows( {'format'=>'tsv'} ) # return value = exported data as tsv
prj.export_rows( {'format'=>'csv'} ) # return value = exported data as csv

### export data in custom table format
prj.export_rows( { 'options'=>{'separator'=>';','lineSeparator'=>'\n'} } ) # return value = exported data as *sv with semicolon for separator

### additional options available:
prj.export_rows( { 'options'=>{'separator'=>';','lineSeparator'=>'\n', 'outputColumnHeaders'=>true, 'outputBlankRows'=>true, 'columns'=>[{'name'=>'Date1'}] } } )
prj.export_rows( { 'options'=>{'separator'=>';','lineSeparator'=>'\n', 'outputColumnHeaders'=>false, 'outputBlankRows'=>false } } )

### save extracted data to file:
exported_data = prj.export_rows( {'format'=>'csv'} )
File.open('../test/exported_data.csv', 'w') do |f| # works
  f.write exported_data
end


##################################
### export data using own template
##################################

### endpoint used by the cURL examples below
export_url = 'http://127.0.0.1:3333/command/core/export-rows/'

### construct template as url-encoded string
prefix = '%7B%0D%0A++%22rows%22+%3A+%5B%0D%0A'
suffix = '%0D%0A++%5D%0D%0A%7D'
separator = '%2C%0D%0A'
row_template = '++++%7B%0D%0A++++++%22Column+1%22+%3A+%7B%7Bjsonize%28cells%5B%22Column+1%22%5D.value%29%7D%7D%0D%0A++++%7D'

### call (using cURL http://curl.haxx.se)
### Arguments are passed as a list so that no shell has to interpret the '%' and '&' characters in the data.
data = "engine=%7B%22facets%22%3A%5B%5D%2C%22mode%22%3A%22row-based%22%7D&project=#{prj.project_id}&format=template&sorting=%7B%22criteria%22%3A%5B%5D%7D&prefix=#{prefix}&suffix=#{suffix}&separator=#{separator}&template=#{row_template}"
system 'curl', '--data', data, export_url

### save extracted data to file:
system 'curl', '--data', data, export_url, out: 'exported_data.json'

### let Ruby do the URL encoding of the template
prefix = CGI.escape('{
"rows" : [
')
suffix = CGI.escape('
]
}')
separator = CGI.escape(',
')
row_template = CGI.escape(' {
"Column 1" : {{jsonize(cells["Column 1"].value)}}
}')
data = "engine=%7B%22facets%22%3A%5B%5D%2C%22mode%22%3A%22row-based%22%7D&project=#{prj.project_id}&format=template&sorting=%7B%22criteria%22%3A%5B%5D%7D&prefix=#{prefix}&suffix=#{suffix}&separator=#{separator}&template=#{row_template}"
system 'curl', '--data', data, export_url


#################
### rename column
#################
prj.rename_column( { 'oldColumnName'=>'Date', 'newColumnName'=>'Date1' } ) # return value = status Hash, e.g. {"code"=>"ok", "historyEntry"=>{"id"=>1438598625335, "description"=>"Rename column Date to Date1", "time"=>"2015-08-03T12:29:53Z"}}


############
### metadata
############
prj.get_project_metadata # return value = metadata as Hash
prj.get_all_project_metadata # return value = metadata for all projects as Hash


##################
### delete project
##################
prj.delete_project # return value = status, e.g. ok
Loading