pygaR: Example for pygar_search() Function
Load Libraries
library(pygaR)
library(ggplot2)
library(maps)
Setup Default ggplot2 Theme
# Shared theme for the bar charts: black-and-white base, x-axis tick labels
# rotated 90 degrees, and enlarged axis and plot titles.
d <- theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, size = 15),
    axis.title = element_text(size = 20),
    plot.title = element_text(size = 30)
  )
def_theme <- d
# Apply a bare look to plot `g`: black-and-white base theme with grid lines,
# axis text and axis ticks blanked out, and empty x/y axis labels.
# Returns the modified plot object.
theme_plot <- function(g) {
  blank_axes <- theme(
    panel.grid = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )
  g + theme_bw() + blank_axes + xlab("") + ylab("")
}
Load States Info/Map
# State lookup table read from the working directory; later code uses its
# `Code` and `Name` columns.
states <- read.csv(file = "states.csv")
# Polygon data for the "state" map, used as the `map` argument of geom_map().
states_map <- map_data(map = "state")
Data Analysis
Search Body
Search Header Information
# Query form 10-K header data (startqtr 201601 through endqtr 201604),
# requesting the state of incorporation, the standard industrial
# classification and the business-address state of each filer.
hdrinfo2016 <- pygaR::pygar_search(
  startqtr = 201601,
  endqtr = 201604,
  form = "10-K",
  header = TRUE,
  query = list(
    Filer = list(
      Company.Data = list(
        State.Of.Incorporation = "State.Of.Incorporation",
        Standard.Industrial.Classification =
          "Standard.Industrial.Classification"
      ),
      Business.Address = list(State = "State")
    )
  )
)

# Keep only rows whose business-address state and state of incorporation
# both appear in states$Code.
hdrinfo2016 <- hdrinfo2016[
  hdrinfo2016$State %in% states$Code &
    hdrinfo2016$State.Of.Incorporation %in% states$Code,
]

# One row per filer: keep the first occurrence of each CIK.
hdrinfo2016 <- hdrinfo2016[!duplicated(hdrinfo2016$CIK), ]
Aggregate Industrial Classifications
# Count the distinct filers (CIK) in each standard industrial classification.
# The count is computed inside FUN so aggregate() always returns one integer
# per group. Aggregating with FUN = unique and measuring lengths afterwards
# breaks when every group has the same number (> 1) of unique values, because
# aggregate() then simplifies the result into a matrix column.
data_industry <- aggregate(
  CIK ~ Standard.Industrial.Classification,
  data = hdrinfo2016,
  FUN = function(ciks) length(unique(ciks))
)
names(data_industry) <- c("Industrial.Classification", "CIK")

# Drop a trailing " [digits]" suffix from the classification label.
data_industry$Industrial.Classification <- sub(
  "^(.*) \\[[0-9]+\\]$", "\\1", data_industry$Industrial.Classification
)
data_industry$CIK <- as.integer(data_industry$CIK)
Graph Industrial Classifications
# Bar chart of the filer count (CIK column) for every industrial
# classification, drawn with the shared bar-chart theme.
g <- ggplot(data_industry, aes(x = Industrial.Classification, y = CIK)) +
  geom_bar(stat = "identity") +
  def_theme
g
Get Top 10%
# Rank classifications by filer count, largest first.
data_industry_top <- data_industry[order(-data_industry$CIK), ]

# Count found at the row that marks the top 10% of the ranking.
# The position is floored explicitly (a fractional index is silently
# truncated by R) and never drops below row 1; with fewer than 10 rows the
# untruncated index would be 0 and select nothing.
bottom_value <- data_industry_top$CIK[
  max(1L, as.integer(floor(nrow(data_industry_top) * 0.1)))
]

# Keep the cutoff row itself (and any ties with it) so the slice really
# contains the top 10% and is never empty for non-empty input.
data_industry_top <- data_industry_top[
  data_industry_top$CIK >= bottom_value,
]
Graph Top 10% Industrial Classifications
# Bar chart restricted to the top-ranked classifications, with vertical
# x-axis labels.
g <- ggplot(data_industry_top, aes(x = Industrial.Classification, y = CIK)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, size = 12)) +
  ggtitle("Top 10% Industrial Classifications")
g
Aggregate State of Business Address
# Count the distinct filers (CIK) per business-address state code.
# Counting inside FUN keeps the result a plain integer column regardless of
# how aggregate() would simplify a list of unique values.
data_hq_state <- aggregate(
  CIK ~ State,
  data = hdrinfo2016,
  FUN = function(ciks) length(unique(ciks))
)
names(data_hq_state) <- c("Code", "Count")
data_hq_state$Count <- as.integer(data_hq_state$Count)

# Look up the full state name for each code; the join key is spelled out so
# it cannot silently change if the two tables ever share another column name.
data_hq_state <- merge(data_hq_state, states, by = "Code")

# Lower-cased state name is used as the map_id when plotting
# (presumably to match the region names in states_map -- confirm).
data_hq_state$state <- tolower(data_hq_state$Name)
data_hq_state$Name <- NULL
data_hq_state$Code <- NULL
data_hq_state
## Count state
## 1 5 alaska
## 2 28 alabama
## 3 31 arkansas
## 4 100 arizona
## 5 1008 california
## 6 192 colorado
## 7 117 connecticut
## 8 19 district of columbia
## 9 55 delaware
## 10 318 florida
## 11 122 georgia
## 12 18 hawaii
## 13 34 iowa
## 14 18 idaho
## 15 230 illinois
## 16 69 indiana
## 17 32 kansas
## 18 35 kentucky
## 19 31 louisiana
## 20 325 massachusetts
## 21 99 maryland
## 22 7 maine
## 23 89 michigan
## 24 110 minnesota
## 25 72 missouri
## 26 11 mississippi
## 27 8 montana
## 28 139 north carolina
## 29 7 north dakota
## 30 27 nebraska
## 31 10 new hampshire
## 32 233 new jersey
## 33 7 new mexico
## 34 173 nevada
## 35 716 new york
## 36 169 ohio
## 37 48 oklahoma
## 38 42 oregon
## 39 231 pennsylvania
## 40 15 rhode island
## 41 27 south carolina
## 42 15 south dakota
## 43 83 tennessee
## 44 565 texas
## 45 84 utah
## 46 156 virginia
## 47 6 vermont
## 48 101 washington
## 49 70 wisconsin
## 50 13 west virginia
## 51 2 wyoming
# Map of filer counts by business-address state: each state is filled by its
# Count, and the plot limits are expanded to cover the whole map outline.
g <- ggplot(data_hq_state, aes(map_id = state)) +
  geom_map(aes(fill = Count), map = states_map) +
  expand_limits(x = states_map$long, y = states_map$lat) +
  ggtitle("2016 Form 10-K Data Business Address State Frequency")
theme_plot(g)
Aggregate State of Incorporation
# Count the distinct filers (CIK) per state-of-incorporation code.
# Counting inside FUN keeps the result a plain integer column regardless of
# how aggregate() would simplify a list of unique values.
data_inc_state <- aggregate(
  CIK ~ State.Of.Incorporation,
  data = hdrinfo2016,
  FUN = function(ciks) length(unique(ciks))
)
names(data_inc_state) <- c("Code", "Count")
data_inc_state$Count <- as.integer(data_inc_state$Count)

# Look up the full state name for each code; the join key is spelled out so
# it cannot silently change if the two tables ever share another column name.
data_inc_state <- merge(data_inc_state, states, by = "Code")

# Lower-cased state name is used as the map_id when plotting
# (presumably to match the region names in states_map -- confirm).
data_inc_state$state <- tolower(data_inc_state$Name)
data_inc_state$Name <- NULL
data_inc_state$Code <- NULL
data_inc_state
## Count state
## 1 5 alaska
## 2 8 alabama
## 3 14 arkansas
## 4 10 arizona
## 5 140 california
## 6 77 colorado
## 7 22 connecticut
## 8 4 district of columbia
## 9 3338 delaware
## 10 137 florida
## 11 44 georgia
## 12 6 hawaii
## 13 20 iowa
## 14 6 idaho
## 15 18 illinois
## 16 58 indiana
## 17 5 kansas
## 18 16 kentucky
## 19 17 louisiana
## 20 44 massachusetts
## 21 360 maryland
## 22 5 maine
## 23 40 michigan
## 24 70 minnesota
## 25 30 missouri
## 26 8 mississippi
## 27 3 montana
## 28 64 north carolina
## 29 5 north dakota
## 30 5 nebraska
## 31 1 new hampshire
## 32 53 new jersey
## 33 5 new mexico
## 34 740 nevada
## 35 135 new york
## 36 89 ohio
## 37 20 oklahoma
## 38 29 oregon
## 39 107 pennsylvania
## 40 5 rhode island
## 41 16 south carolina
## 42 6 south dakota
## 43 22 tennessee
## 44 86 texas
## 45 35 utah
## 46 82 virginia
## 47 2 vermont
## 48 47 washington
## 49 42 wisconsin
## 50 8 west virginia
## 51 13 wyoming
Graph State of Incorporation
# Map of filer counts by state of incorporation: each state is filled by its
# Count, and the plot limits are expanded to cover the whole map outline.
# The title names the state of incorporation; it previously repeated the
# business-address title from the earlier map.
g <- ggplot(data_inc_state, aes(map_id = state)) +
  geom_map(aes(fill = Count), map = states_map) +
  expand_limits(x = states_map$long, y = states_map$lat) +
  ggtitle("2016 Form 10-K Data State of Incorporation Frequency")
theme_plot(g)
Graph States of Incorporation
# Upper-case the first character of `val` and leave the rest untouched.
# The pieces are collapsed with "", so the result is always a single string.
cap <- function(val) {
  first_chr <- toupper(substring(val, 1, 1))
  remainder <- substring(val, 2)
  paste0(first_chr, remainder, collapse = "")
}
# Relabel the two remaining columns (count, lower-case state name) for the
# bar chart.
names(data_inc_state) <- c("Count", "State")

# Capitalise each state name. vapply() guarantees an unnamed character vector
# of the same length as the input, even when there are no rows, whereas
# sapply() would return a list in that case.
data_inc_state$State <- vapply(
  as.character(data_inc_state$State), cap, character(1),
  USE.NAMES = FALSE
)

# Bar chart of filer counts per state of incorporation.
g <- ggplot(data_inc_state, aes(x = State, y = Count)) +
  geom_bar(stat = "identity") +
  def_theme +
  ggtitle("States of Incorporation")
g