協作閣

開源協作部落格

Patent document 1

pizza

Freya / 2019-03-29 /


#install.packages("readr")
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("ggthemes")
library(readr)
library(dplyr)
library(ggplot2)
library(ggthemes)
# The pizza dataset contains patent documents from the WIPO Patentscope database.
library(readr)
library(dplyr)
# Read pizza.csv and drop the five applicant/inventor columns in one negated
# selection, then preview the first rows.
pizza <- read_csv("pizza.csv") %>%
  select(-c(
    applicants_cleaned, applicants_cleaned_type, applicants_original,
    inventors_cleaned, inventors_original
  ))
head(pizza)
#> # A tibble: 6 x 26
#>   applicants_orga… ipc_class ipc_codes ipc_names ipc_original ipc_subclass_co…
#>   <chr>            <chr>     <chr>     <chr>     <chr>        <chr>           
#> 1 <NA>             A21: Bak… A21D 13/… A21D 13/… A21D 13/00;… A21D; A23L      
#> 2 <NA>             A21: Bak… A21B 3/13 A21B 3/1… A21B 3/13    A21B            
#> 3 <NA>             A21: Bak… A21C 15/… A21C 15/… A21C 15/04   A21C            
#> 4 Lazarillo De To… A21: Bak… A21D 13/… A21D 13/… A21D 13/00;… A21D; A23L      
#> 5 <NA>             B65: Con… B65D 21/… B65D 21/… B65D 21/032… B65D            
#> 6 <NA>             B65: Con… B65D 85/… B65D 85/… B65D 85/36   B65D            
#> # … with 20 more variables: ipc_subclass_detail <chr>,
#> #   ipc_subclass_names <chr>, priority_country_code <chr>,
#> #   priority_country_code_names <chr>, priority_data_original <chr>,
#> #   priority_date <chr>, publication_country_code <chr>,
#> #   publication_country_name <chr>, publication_date <chr>,
#> #   publication_date_original <chr>, publication_day <dbl>,
#> #   publication_month <dbl>, publication_number <chr>,
#> #   publication_number_espacenet_links <chr>, publication_year <dbl>,
#> #   title_cleaned <chr>, title_nlp_cleaned <chr>,
#> #   title_nlp_multiword_phrases <chr>, title_nlp_raw <chr>,
#> #   title_original <chr>
# Preview the first six rows (head()'s default count) of the loaded table.
head(pizza, n = 6L)
#> # A tibble: 6 x 26
#>   applicants_orga… ipc_class ipc_codes ipc_names ipc_original ipc_subclass_co…
#>   <chr>            <chr>     <chr>     <chr>     <chr>        <chr>           
#> 1 <NA>             A21: Bak… A21D 13/… A21D 13/… A21D 13/00;… A21D; A23L      
#> 2 <NA>             A21: Bak… A21B 3/13 A21B 3/1… A21B 3/13    A21B            
#> 3 <NA>             A21: Bak… A21C 15/… A21C 15/… A21C 15/04   A21C            
#> 4 Lazarillo De To… A21: Bak… A21D 13/… A21D 13/… A21D 13/00;… A21D; A23L      
#> 5 <NA>             B65: Con… B65D 21/… B65D 21/… B65D 21/032… B65D            
#> 6 <NA>             B65: Con… B65D 85/… B65D 85/… B65D 85/36   B65D            
#> # … with 20 more variables: ipc_subclass_detail <chr>,
#> #   ipc_subclass_names <chr>, priority_country_code <chr>,
#> #   priority_country_code_names <chr>, priority_data_original <chr>,
#> #   priority_date <chr>, publication_country_code <chr>,
#> #   publication_country_name <chr>, publication_date <chr>,
#> #   publication_date_original <chr>, publication_day <dbl>,
#> #   publication_month <dbl>, publication_number <chr>,
#> #   publication_number_espacenet_links <chr>, publication_year <dbl>,
#> #   title_cleaned <chr>, title_nlp_cleaned <chr>,
#> #   title_nlp_multiword_phrases <chr>, title_nlp_raw <chr>,
#> #   title_original <chr>
library(dplyr)
# Add a per-row counter and shorten the publication column names.
#
# record_count is the constant 1 on every row so that later steps can tally
# records by summing it. The previous `sum(publication_number = 1)` never read
# the publication_number column: it passed the literal 1 to sum() under an
# argument name, which only happened to evaluate to 1.
pizza <- pizza %>%
  mutate(record_count = 1) %>%
  rename(
    pubcountry = publication_country_name,
    pubcode = publication_country_code,
    pubyear = publication_year
  )
head(pizza)
#> # A tibble: 6 x 27
#>   applicants_orga… ipc_class ipc_codes ipc_names ipc_original ipc_subclass_co…
#>   <chr>            <chr>     <chr>     <chr>     <chr>        <chr>           
#> 1 <NA>             A21: Bak… A21D 13/… A21D 13/… A21D 13/00;… A21D; A23L      
#> 2 <NA>             A21: Bak… A21B 3/13 A21B 3/1… A21B 3/13    A21B            
#> 3 <NA>             A21: Bak… A21C 15/… A21C 15/… A21C 15/04   A21C            
#> 4 Lazarillo De To… A21: Bak… A21D 13/… A21D 13/… A21D 13/00;… A21D; A23L      
#> 5 <NA>             B65: Con… B65D 21/… B65D 21/… B65D 21/032… B65D            
#> 6 <NA>             B65: Con… B65D 85/… B65D 85/… B65D 85/36   B65D            
#> # … with 21 more variables: ipc_subclass_detail <chr>,
#> #   ipc_subclass_names <chr>, priority_country_code <chr>,
#> #   priority_country_code_names <chr>, priority_data_original <chr>,
#> #   priority_date <chr>, pubcode <chr>, pubcountry <chr>,
#> #   publication_date <chr>, publication_date_original <chr>,
#> #   publication_day <dbl>, publication_month <dbl>, publication_number <chr>,
#> #   publication_number_espacenet_links <chr>, pubyear <dbl>,
#> #   title_cleaned <chr>, title_nlp_cleaned <chr>,
#> #   title_nlp_multiword_phrases <chr>, title_nlp_raw <chr>,
#> #   title_original <chr>, record_count <dbl>
# Keep only the publication country, code, year and the per-row counter.
p1 <- select(pizza, pubcountry, pubcode, pubyear, record_count)
head(p1)
#> # A tibble: 6 x 4
#>   pubcountry                 pubcode pubyear record_count
#>   <chr>                      <chr>     <dbl>        <dbl>
#> 1 United States of America   US         2009            1
#> 2 United States of America   US         2014            1
#> 3 United States of America   US         2013            1
#> 4 European Patent Office     EP         2007            1
#> 5 United States of America   US         2003            1
#> 6 Patent Co-operation Treaty WO         2002            1
# Total record_count per publication year; count() names the result column n.
pt <- p1 %>%
  count(pubyear, wt = record_count)
head(pt)
#> # A tibble: 6 x 2
#>   pubyear     n
#>     <dbl> <dbl>
#> 1    1940     1
#> 2    1954     1
#> 3    1956     1
#> 4    1957     1
#> 5    1959     1
#> 6    1962     1
# NOTE: XQuartz is a macOS application (https://www.xquartz.org), not an R
# package, so it cannot be installed with install.packages().

# Line chart of the yearly totals in pt (n) against publication year.
# Uses ggplot() + geom_line() because qplot() is deprecated in ggplot2.
ggplot(pt, aes(x = pubyear, y = n)) +
  geom_line()

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.