Data Dictionary

Bronze

Published

May 8, 2025

Modified

May 17, 2025

LDI


Overview

The process starts by web scraping the IES Awards website.

There are several outputs of the web scraping process:

  • ies_award_links - this file contains a list of award URLs from which the following files are derived
  • ies_award_elements_short AKA ies short - this file contains structured data that can easily be transformed into a dataframe (e.g., award number, amount, year, title, etc.)
  • ies_award_elements_long AKA ies long - this file contains unstructured data, mainly purpose and project_activities
  • ies_award_elements_archive - a backup of the IES Awards website; contains the raw HTML of each archived page, keyed by URL
  • ies_award_elements_urls - contains the award URLs from ies_award_links as key-value pairs (award number: URL)

This data is housed in the bronze database.

File and output structure

flowchart TB

    %% Pipelines: the four processing steps shown in this diagram.
    A["pipeline: award_links"]
    B["pipeline: award_elements"]
    C["pipeline: convert_award_elements_short_dataframe"]
    D["pipeline: archive"]

    %% Outputs: the files written by the pipelines above.
    %% NOTE(review): ies_award_archive.json and ies_award_urls.json do not match the
    %% dataset names used elsewhere in this document (ies_award_elements_archive,
    %% ies_award_elements_urls) - confirm which naming is canonical.
    AA["file: ies_award_links.json"]
    AB["file: ies_award_elements_short.json"]
    AC["file: ies_award_elements_long.json"]
    AD["file: ies_award_elements_short.parquet"]
    AE["file: ies_award_archive.json"]
    AF["file: ies_award_urls.json"]

    %% Flow: award_links writes the links file, which then feeds both the
    %% award_elements pipeline and the archive pipeline.
    A --> AA
    AA --> B   
    AA --> D

    %% The archive pipeline writes a single archive file.
    D --> AE

    %% award_elements writes three files: short, urls, and long.
    B --> AB
    B --> AF
    B --> AC
    
    %% Only the short JSON goes through the dataframe conversion, yielding the parquet file.
    AB --> C --> AD

Schema

flowchart TB
    
    %% Data: the source website and the five datasets derived from it.
    A[IES Award Website]
    B[ies_award_links]
    C[ies_award_elements_short]
    D[ies_award_elements_long]
    E[ies_award_elements_urls]
    F[ies_award_elements_archive]

    %% Databases: the single storage target shown in this diagram.
    bronze[(Database Bronze)]
    
    %% Flow: the links dataset comes from the website; each of the other four
    %% datasets derives from the links and is loaded into bronze.
    %% NOTE(review): ies_award_links itself has no edge to bronze here - confirm
    %% whether it is also stored in the bronze database.
    A --> B
    B --> C --> bronze
    B --> D --> bronze
    B --> E --> bronze
    B --> F --> bronze

erDiagram
    %% ies_award_links: three required fields - a page counter, an as-of date,
    %% and the list of award link strings.
    %% NOTE(review): presumably last_page is the last listing page scraped and
    %% as_of is the scrape date - confirm against the award_links pipeline.
    "ies_award_links" {
        int         last_page   "not null"
        date        as_of       "not null"
        list[str]   links       "not null"
    }

Bronze tables

erDiagram
    %% ies_award_elements_short: structured award attributes, one entity keyed by
    %% award_number_clean (the PK). The raw award_number is a nullable list and is
    %% not the key in this table.
    %% NOTE(review): presumably award_number_clean is a normalized single-value
    %% form of award_number - confirm against the award_elements pipeline.
    %% The quoted "null" / "not null" text is each column's nullability.
    "ies_award_elements_short" {
        list award_number  "null"
        str award_number_clean PK "not null"
        str name   "not null"
        str status  "not null"
        str award_type  "not null"
        list office  "not null"
        list program  "not null"
        list program_topics  "null"
        list evaluation_topics  "null"
        str principal_investigator  "null"
        int amount  "not null"
        str awardee  "not null"
        int award_year  "null"
        date award_date_range_start  "null"
        date award_date_range_end  "null"
        str project_type  "null"
        date as_of  "not null"
    }

erDiagram
    %% ies_award_elements_long: free-text award fields (purpose and
    %% project_activities) plus an as-of date.
    %% NOTE(review): award_number is marked both PK and "null", which is
    %% contradictory for a primary key; ies_award_elements_short keys on
    %% award_number_clean instead. Confirm the intended key and nullability.
    "ies_award_elements_long" {
        list award_number       PK  "null"
        str  purpose                "null"
        str  project_activities     "null"
        date as_of                  "not null"
    }

erDiagram
    %% ies_award_elements_urls: maps an award number to its required url.
    %% NOTE(review): award_number is marked both PK and "null", which is
    %% contradictory for a primary key - confirm the intended nullability.
    "ies_award_elements_urls" {
        list award_number       PK  "null"
        str  url                    "not null"
    }

erDiagram
    %% ies_award_elements_archive: one html string per url, with url as the PK.
    %% NOTE(review): unlike the other tables in this document, these columns carry
    %% no "null" / "not null" annotation - confirm and add for consistency.
    "ies_award_elements_archive" {
        str url     PK
        str html
    }