top10_crops_2021.df <- read.csv(“top10_crops_2021.csv”) %>% arrange(desc(billion_tons))
Dataframe: top5_countries.df
Data Source: USDA/IPAD
Top 5 Soy Countries
top5_countries <- c(‘Brazil’, ‘United States’, ‘China’, ‘Argentina’, ‘India’)
Pct of Soy Production
top5_countries_pct <- c(42, 31, 7, 5, 3)
Create Top 5 Soy Countries df
top5_countries.df <- data.frame(top5_countries, top5_countries_pct) %>%
arrange(desc(top5_countries_pct))
Table: prod_yield_all_states_2000_2022
Data Source: soybeans_prod_yield_2000-22.csv
SQL Query: project_check_null_production_values
SELECT Value
FROM prod_yield_all_states_2000_2022
WHERE Value IS NULL
SQL Query: project_check_max_min_production_values
SELECT
Max(Value) as max_value,
MIN(Value) as min_value
FROM prod_yield_all_states_2000_2022
Table: top_11_soy_states
SQL Query:
project_create_table_top11_states
CREATE TABLE top_11_soy_states AS
SELECT
State as state,
fips.state_postal_abbreviation as st_abv, – from BiqQuery - Census FIPS Utility dataset
SUM(Value) AS bushels_total,
ROUND((SUM(Value) / (SELECT SUM(Value) FROM prod_yield_all_states_2000_2022)) * 100, 2) AS pct_total_bushels,
ROUND(SUM(SUM(Value)) OVER (ORDER BY SUM(Value) DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) / (SELECT SUM(Value) FROM prod_yield_all_states_2000_2022) * 100, 2) AS cumulative_pct
FROM prod_yield_all_states_2000_2022 AS soy
LEFT JOIN bigquery-public-data.census_utility.fips_codes_states AS fips
ON soy.State = UPPER(fips.state_name)
GROUP BY state, st_abv
ORDER BY bushels_total DESC
LIMIT 11
Table: prod_yield_top11_states
SQL Query:
project_create_table_prod_data_top11_states
# Create Table of Selected Soy Data Columns for Top 11 States
CREATE TABLE prod_yield_top11_states AS
SELECT
# filter for the relevant data columns found in the complete dataset
all_states.Year as year,
all_states.State as state,
# Rename Data_Item values to more concise descriptors
CASE all_states.Data_Item
WHEN ‘SOYBEANS - PRODUCTION, MEASURED IN BU’ THEN ‘total_bushels’
WHEN ‘SOYBEANS - YIELD, MEASURED IN BU / ACRE’ THEN ‘bushels_acre’
ELSE all_states.Data_Item
END as measure, # rename column to more concise descriptor
all_states.Value as quantity # rename column to more concise descriptor
# Soybeans dataset from NASS for all states
From prod_yield_all_states_2000_2022 as all_states
# Subset of all_states dataset to filter for top 11 soybean producing states
JOIN top_11_soy_states as top_states
ON all_states.State = top_states.state # Inner Join filters dataset for top 11 states
GROUP BY state, year, measure, quantity
ORDER BY year, state, measure
Table: soybean_prices_monthly_top11_states
SQL Query:
project_create_table_monthly_prices_2000-22_top_11_states
# Create Table of Monthly Soybean Prices for Top 11 States
CREATE TABLE soybean_prices_monthly_top11_states AS
SELECT
# Cast Year(INT) as STRING, then Concat with Period(MON) and parse the resulting string as DATE
PARSE_DATE(‘%Y %b’, CONCAT(CAST(all_states.Year as STRING),” “, all_states.Period)) as price_period,
all_states.State as state,
top_states.st_abv,
CAST(all_states.Value AS FLOAT64) as usd_bushel, –value in dataset is USD/bushels; cast as INT
# Soybeans dataset from NASS for all states
From prices_bushel_pct_parity_2000-22 as all_states
# Subset of all_states dataset to filter for top 11 soybean producing states
JOIN top_11_soy_states as top_states
ON all_states.State = top_states.state # Inner Join filters dataset for top 11 states
WHERE NOT CONTAINS_SUBSTR(all_states.Value,”(D)“) # NASS witholds data that will identify specific growers
GROUP BY state, st_abv, price_period, usd_bushel
ORDER BY price_period
Table: bigquery-public-data.noaa_gsod.stations
SQL Query: project_verify_usaf_wban_lengths
# verify lengths of the usaf and wban codes used in the NOAA stations table
SELECT
MIN(CHAR_LENGTH(usaf)) as min_usaf_length,
MAX(CHAR_LENGTH(usaf)) as max_usaf_length,
MIN(CHAR_LENGTH(wban)) as min_length_wban,
MAX(CHAR_LENGTH(wban)) as max_length_wban
FROM bigquery-public-data.noaa_gsod.stations
Table: soy_states_noaa_stns
SQL Query: project_verify_stn_code_max_min_length
# verify the length of the station code used in annual GSOD tables
SELECT
MIN(CHAR_LENGTH(stn)) as min_stn_length,
MAX(CHAR_LENGTH(stn)) as max_stn_length
FROM bigquery-public-data.noaa_gsod.gsod2000
SQL Query: project_us_weather_stns_nulls
– check whether generic code ‘999999’ used for usaf codes for stations within top soy states
SELECT * FROM bigquery-public-data.noaa_gsod.stations as noaa
JOIN top_11_soy_states as soy
ON noaa.state = soy.st_abv
WHERE noaa.usaf = ‘999999’ # gsod.stations uses ‘999999’ as generic station code for multiple stations
SQL Query: project_concat_wban_soy_states
# verify whether gsod station reports used 5 digit wban codes with a leading “0” as the 6 digit stn code
WITH soy_wban AS
(SELECT CONCAT(“0”,wban) as xwban
FROM soy_states_noaa_stns
WHERE wban != “99999”) # “99999” used as null values
SELECT
DISTINCT(gs.station),
gs.location,
gs.state
FROM weather_top11_states as gs
JOIN soy_wban
ON soy_wban.xwban = gs.station
ORDER BY gs.station
SQL Query: project_create_table_soy_states_noaa_stns
CREATE TABLE soy_states_noaa_stns AS
SELECT usaf, wban, name, soy.state, st_abv
FROM bigquery-public-data.noaa_gsod.stations as noaa
JOIN top_11_soy_states as soy
ON noaa.state = soy.st_abv
WHERE noaa.usaf != ‘999999’ – gsod.stations uses ‘999999’ as generic station code for multiple stations
ORDER BY noaa.state
Table: weather_top11_states
SQL Query: project_create_table_weather_top11_states_nulls
# – Create Table of Weather Data for Top 11 States
CREATE TABLE weather_top11_states_nulls AS
SELECT
all_states.stn as station,
# Concat with year, mo, da and parse the resulting string as a DATE
PARSE_DATE(‘%F’, CONCAT(all_states.year,“-”,all_states.mo,“-”,all_states.da)) as weather_date,
top_states.name as location,
top_states.state,
top_states.st_abv,
NULLIF(all_states.temp, 9999.9) as mean_temp,
NULLIF(all_states.max, 9999.9) as max_temp,
NULLIF(all_states.min, 9999.9) as min_temp,
NULLIF(all_states.prcp, 99.99) as precip
# Weather dataset from Big Query GSOD - Union all datasets 2000-2022 for all states
From bigquery-public-data.noaa_gsod.gsod20* as all_states
# Join on Top 11 States NOAA stations to filter for top 11 States
JOIN soy_states_noaa_stns as top_states
ON
all_states.stn = top_states.usaf # Inner Join filters dataset by NOAA stations for top 11 states
WHERE
NOT all_states.stn = “999999” – Generic code assigned to numerous stations globally
AND
NOT CONTAINS_SUBSTR(all_states.year, “2023”)
GROUP BY
station, location, state, st_abv, weather_date, mean_temp, max_temp, min_temp, precip