##----------------------------------------------------------- ## Instala pacote RPostgres e o de leitura de senha install.packages("RPostgres") install.packages("getPass") ## Instala os pacotes dplyr e o dbplyr install.packages("dplyr") install.packages("dbplyr") ## Instala pacote com dados sobre voos de NY, para testes install.packages("nycflights13") library(dplyr) library(dbplyr) library(RPostgres) ## Carrega o driver PostgreSQL drv <- dbDriver("Postgres") ## Abre uma conexao con <- dbConnect(drv, host="postgresql.linux.ime.usp.br", port="5432", dbname="seu_login", user="seu_login", password=getPass::getPass()) ## Cria uma tabela no BD com dados para teste, copiados do pacote nycflights13, que contem: ## "Airline on-time data for all flights departing NYC in 2013. ## Also includes useful 'metadata' on airlines, airports, weather, and planes." copy_to(con, nycflights13::flights, "flights", temporary = FALSE, indexes = list( c("year", "month", "day"), "carrier", "tailnum", "dest" ) ) ######################################### ## EXEMPLO 1 ## Cria uma referencia para a tabela "flights", que sera usada nas proximas consultas flights_db <- tbl(con, "flights") ## Seleciona as colunas de data, destino, atraso na partida e ## atraso na chegada dos voos delays <- select(flights_db, year:day, dest, dep_delay, arr_delay) ## Seleciona as linhas dos voos que tiveram um grande atraso ## na partida (dep_delay) maior que 240 big_delays <- filter(delays, dep_delay > 240) ## Agrupa os grandes atrasos por destino delays_by_dest <- group_by(big_delays, dest) ## Obtem o atraso medio na partida por destino mean_delays_by_dest <- summarise(delays_by_dest, delay = mean(dep_delay)) print(mean_delays_by_dest) ######################################### ## EXEMPLO 1 -- Versao Pipeline flights_db <- tbl(con, "flights") mean_delays_by_dest <- flights_db %>% select(year:day, dest, dep_delay, arr_delay) %>% filter(dep_delay > 240) %>% group_by(dest) %>% summarise(delay = mean(dep_delay)) print(mean_delays_by_dest) ######################################### ## EXEMPLO 2 ## Atraso médio na chegada por aeronave (identificada por tailnum), ## mas somente paras as aeronaves que fizeram mais do que 100 voos tailnum_delay_db <- flights_db %>% group_by(tailnum) %>% summarise( delay = mean(arr_delay), n = n() ) %>% arrange(desc(delay)) %>% filter(n > 100) ## Mostra a traducao da consulta para SQL tailnum_delay_db %>% show_query() ## Executa a consulta e depois usa a função collect() para trazer todas as linhas da resposta à consulta do BD para o R tailnum_delay <- tailnum_delay_db %>% collect() tailnum_delay ######################################### ## EXEMPLO 3 ## Cria uma nova variavel (dep_type) para classificar se um voo esta no horario certo ou atrasado # e plota um grafico de barras das frequencias dela por companhia aerea library(ggplot2) flights_db %>% mutate(dep_type = ifelse(dep_delay < 5, "on time", "delayed")) %>% ggplot(aes(x = carrier, fill = dep_type, main = "Frequency of Delayed vs On Time Arrivals by Carrier")) + geom_bar() ## Fecha a conexao com o BD dbDisconnect(con) ## Libera os recursos usados pelo driver dbUnloadDriver(drv)