pygaR: Example for pygar_search() Function

Load Libraries

library(pygaR)
library(ggplot2)
library(maps)

Setup Default ggplot2 Theme

# Default theme for the bar charts: black-and-white base theme with the
# x-axis tick labels rotated 90 degrees and larger axis/plot titles.
d <- theme_bw() + theme(
    plot.title = element_text(size = 30),
    axis.title = element_text(size = 20),
    axis.text.x = element_text(angle = 90, size = 15)
)
def_theme <- d
# Apply the stripped-down look used for the map plots.
#
# Adds theme_bw(), removes the panel grid, axis text and axis ticks, and
# sets both axis labels to empty strings.
#
# g: a ggplot object.
# Returns the ggplot object with the theme and empty labels added.
theme_plot <- function(g) {
    bare_axes <- theme(
        panel.grid = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank()
    )
    g + theme_bw() + bare_axes + xlab("") + ylab("")
}

Load States Info/Map

# State lookup table read from the working directory; later chunks use its
# `Code` column to filter/merge and its `Name` column to build map ids.
states <- read.csv(file = "states.csv")
# Polygon data for the "state" map, used as the `map` of the geom_map layers.
states_map <- map_data(map = "state")

Data Analysis

Search Body

Search Header Information

# Search the header data of Form 10-K filings for quarters 201601-201604.
# NOTE(review): the nested `query` list presumably mirrors the header
# structure and maps each requested field to the name of the result column
# (later code reads $State, $State.Of.Incorporation and
# $Standard.Industrial.Classification) - confirm against the pygaR docs.
hdrinfo2016 <- pygaR::pygar_search(
    startqtr = 201601,
    endqtr = 201604,
    form = "10-K",
    header = TRUE,
    query = list(Filer = list(
        Company.Data = list(
            State.Of.Incorporation = "State.Of.Incorporation",
            Standard.Industrial.Classification =
                "Standard.Industrial.Classification"
        ),
        Business.Address = list(State = "State")
    ))
)
# Keep only rows whose business-address state is a known state code ...
hdrinfo2016 <- hdrinfo2016[hdrinfo2016$State %in% states$Code, ]
# ... and whose state of incorporation is a known state code as well.
hdrinfo2016 <- hdrinfo2016[
    hdrinfo2016$State.Of.Incorporation %in% states$Code,
]
# Keep the first row seen for each CIK so every filer is counted once.
hdrinfo2016 <- hdrinfo2016[!duplicated(hdrinfo2016$CIK), ]

Aggregate Industrial Classifications

# Count the distinct filers (CIKs) in each Standard Industrial Classification.
#
# The count is computed inside aggregate() so every group yields exactly one
# number. Collecting unique() per group and taking lengths afterwards is
# fragile: when all groups have the same number of CIKs (or there is only one
# group) aggregate() simplifies the result to a vector/matrix and the
# per-element lengths no longer correspond to group sizes.
data_industry <- aggregate(
    CIK ~ Standard.Industrial.Classification,
    data = hdrinfo2016,
    FUN = function(ciks) length(unique(ciks))
)
names(data_industry) <- c("Industrial.Classification", "CIK")

# Strip a trailing " [digits]" suffix from the classification label.
data_industry$Industrial.Classification <- sub(
    "^(.*) \\[[0-9]+\\]$", "\\1", data_industry$Industrial.Classification
)
data_industry$CIK <- as.integer(data_industry$CIK)

Graph Industrial Classifications

# Bar chart of the filer count per industrial classification.
g <- ggplot(data_industry, aes(x = Industrial.Classification, y = CIK)) +
    geom_bar(stat = "identity") +
    def_theme
g

plot of chunk graph_industry

Get Top 10%

# Rank the classifications by filer count (largest first) and keep the top
# 10% of rows.
#
# The number of rows kept is rounded up, so it is a whole number and at least
# one whenever there is any data. Indexing with a fractional position and
# filtering with a strict ">" drops the cut-off row and its ties, and returns
# nothing at all when there are fewer than ten classifications.
data_industry_top <- data_industry[order(-data_industry$CIK), ]
top_rows <- ceiling(nrow(data_industry_top) * 0.1)
# Smallest filer count that is still part of the top 10%.
bottom_value <- data_industry_top$CIK[top_rows]
data_industry_top <- head(data_industry_top, top_rows)

Graph Top 10% Industrial Classifications

# Bar chart of the top 10% classifications, x labels rotated 90 degrees.
g <- ggplot(data_industry_top, aes(x = Industrial.Classification, y = CIK)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, size = 12)) +
    ggtitle("Top 10% Industrial Classifications")
g

plot of chunk graph_industry_top

Aggregate State of Business Address

# Count the distinct filers (CIKs) per business-address state.
#
# The count is computed inside aggregate() so each state yields exactly one
# number regardless of how aggregate() simplifies its result.
data_hq_state <- aggregate(
    CIK ~ State,
    data = hdrinfo2016,
    FUN = function(ciks) length(unique(ciks))
)
names(data_hq_state) <- c("Code", "Count")
data_hq_state$Count <- as.integer(data_hq_state$Count)

# Look up the full state name by code; the lower-cased name is what the map
# plot uses as its map_id.
data_hq_state <- merge(data_hq_state, states, by = "Code")
data_hq_state$state <- tolower(data_hq_state$Name)
data_hq_state$Name <- NULL
data_hq_state$Code <- NULL
data_hq_state
##    Count                state
## 1      5               alaska
## 2     28              alabama
## 3     31             arkansas
## 4    100              arizona
## 5   1008           california
## 6    192             colorado
## 7    117          connecticut
## 8     19 district of columbia
## 9     55             delaware
## 10   318              florida
## 11   122              georgia
## 12    18               hawaii
## 13    34                 iowa
## 14    18                idaho
## 15   230             illinois
## 16    69              indiana
## 17    32               kansas
## 18    35             kentucky
## 19    31            louisiana
## 20   325        massachusetts
## 21    99             maryland
## 22     7                maine
## 23    89             michigan
## 24   110            minnesota
## 25    72             missouri
## 26    11          mississippi
## 27     8              montana
## 28   139       north carolina
## 29     7         north dakota
## 30    27             nebraska
## 31    10        new hampshire
## 32   233           new jersey
## 33     7           new mexico
## 34   173               nevada
## 35   716             new york
## 36   169                 ohio
## 37    48             oklahoma
## 38    42               oregon
## 39   231         pennsylvania
## 40    15         rhode island
## 41    27       south carolina
## 42    15         south dakota
## 43    83            tennessee
## 44   565                texas
## 45    84                 utah
## 46   156             virginia
## 47     6              vermont
## 48   101           washington
## 49    70            wisconsin
## 50    13        west virginia
## 51     2              wyoming
# Map of filer counts by business-address state, filled by Count.
g <- ggplot(data_hq_state, aes(map_id = state)) +
    geom_map(aes(fill = Count), map = states_map) +
    # Stretch the axes to cover the whole map.
    expand_limits(x = states_map$long, y = states_map$lat) +
    ggtitle("2016 Form 10-K Data Business Address State Frequency")
theme_plot(g)

plot of chunk graph_hq_state

Aggregate State of Incorporation

# Count the distinct filers (CIKs) per state of incorporation.
#
# The count is computed inside aggregate() so each state yields exactly one
# number regardless of how aggregate() simplifies its result.
data_inc_state <- aggregate(
    CIK ~ State.Of.Incorporation,
    data = hdrinfo2016,
    FUN = function(ciks) length(unique(ciks))
)
names(data_inc_state) <- c("Code", "Count")
data_inc_state$Count <- as.integer(data_inc_state$Count)

# Look up the full state name by code; the lower-cased name is what the map
# plot uses as its map_id.
data_inc_state <- merge(data_inc_state, states, by = "Code")
data_inc_state$state <- tolower(data_inc_state$Name)
data_inc_state$Name <- NULL
data_inc_state$Code <- NULL
data_inc_state
##    Count                state
## 1      5               alaska
## 2      8              alabama
## 3     14             arkansas
## 4     10              arizona
## 5    140           california
## 6     77             colorado
## 7     22          connecticut
## 8      4 district of columbia
## 9   3338             delaware
## 10   137              florida
## 11    44              georgia
## 12     6               hawaii
## 13    20                 iowa
## 14     6                idaho
## 15    18             illinois
## 16    58              indiana
## 17     5               kansas
## 18    16             kentucky
## 19    17            louisiana
## 20    44        massachusetts
## 21   360             maryland
## 22     5                maine
## 23    40             michigan
## 24    70            minnesota
## 25    30             missouri
## 26     8          mississippi
## 27     3              montana
## 28    64       north carolina
## 29     5         north dakota
## 30     5             nebraska
## 31     1        new hampshire
## 32    53           new jersey
## 33     5           new mexico
## 34   740               nevada
## 35   135             new york
## 36    89                 ohio
## 37    20             oklahoma
## 38    29               oregon
## 39   107         pennsylvania
## 40     5         rhode island
## 41    16       south carolina
## 42     6         south dakota
## 43    22            tennessee
## 44    86                texas
## 45    35                 utah
## 46    82             virginia
## 47     2              vermont
## 48    47           washington
## 49    42            wisconsin
## 50     8        west virginia
## 51    13              wyoming

Graph State of Incorporation

# Map of filer counts by state of incorporation, filled by Count.
g <- ggplot(data_inc_state, aes(map_id = state))
g <- g + geom_map(aes(fill = Count), map = states_map)
# Stretch the axes to cover the whole map.
g <- g + expand_limits(x = states_map$long, y = states_map$lat)
# Title names the state of incorporation: this plot is built from
# data_inc_state, not from the business-address counts.
g <- g + ggtitle("2016 Form 10-K Data State of Incorporation Frequency")
theme_plot(g)

plot of chunk graph_inc_state

Graph States of Incorporation (Bar Chart)

# Capitalize the first letter of every space-separated word.
#
# Multi-word names are fully capitalized ("new york" -> "New York") rather
# than only their first character, and the function is vectorized: each
# element of `val` is converted independently.
#
# val: character vector (coerced with as.character()).
# Returns a character vector of the same length as `val`.
cap <- function(val) {
    vapply(
        strsplit(as.character(val), " ", fixed = TRUE),
        function(words) {
            paste0(
                toupper(substring(words, 1, 1)),
                substring(words, 2),
                collapse = " "
            )
        },
        character(1)
    )
}
# Bar chart of filer counts per state of incorporation.
#
# Rename the `state` column by name instead of overwriting all names by
# position, so the rename does not depend on the column order.
names(data_inc_state)[names(data_inc_state) == "state"] <- "State"
# Capitalize each state name for display. vapply() guarantees a character
# vector even for empty input, where sapply() would return a list.
data_inc_state$State <- vapply(
    data_inc_state$State, cap, character(1), USE.NAMES = FALSE
)
g <- ggplot(data_inc_state, aes(x = State, y = Count))
g <- g + geom_bar(stat = "identity")
g <- g + def_theme
g <- g + ggtitle("States of Incorporation")
g

plot of chunk graph_states_inc