Goal
I’m continuing my experiments with postgresql/postgis databases, using the Global Historical Climate Network (GHCN v2) dataset of monthly precipitation and temperature data from around the world. I’m going to create and query a database out of GHCN data, downloaded here on May 4, 2010.
Tips
Read the metadata on the site, because the format is a bit tedious.The main things to note are:
- i had to convert the station list table v2.precip.inv to a csv file before i could copy it into a postgres table
- i had to import the ~80Mb datafile (v2.prcp) as a postgres table of text rows, and then parse the actual columns out (see sql below) into a new table
- the code in the stations list table has one digit less than that in the data table – a code for whether there are data duplicates or not (read the metadata file online)
- i’ve used the raw data here. there is a separate quality-controlled table of data that is MUCH smaller – i have not used that here.
- -9999 represents null data, -8888 respresents trace precipitation
Code
–create GHCN station table
– drop table ghcn_stations;
create table ghcn_stations (code numeric primary key not null, name varchar(40), country varchar(25), latitude float, longitude float, elevation integer) with OIDS;
–load from text file
copy ghcn_stations FROM ‘E:/gisdata/World/GHCNv2/v2.prcp.inv.csv’ CSV;
–select stations in Uganda
select * from ghcn_stations where country = ‘UGANDA’;
–make a geometry column out of lat and longitude
Alter table ghcn_stations add column the_geom geometry;
–populate with lat and long and 4326 srid
Update ghcn_stations
SET the_geom = PointFromText(‘Point(‘||longitude||”||latitude||’)',4326);
–above did not work
Update ghcn_stations
SET the_geom = ST_Point(longitude,latitude);
–above worked but srid not yet set
Update ghcn_stations
SET the_geom = St_SETSRID(the_geom,4326);
–above set the SRID
–
–now add spatial index
create index idx_ghcn_stations on ghcn_stations using GIST(the_geom);
VACUUM ANALYZE;
–test out how to extract select characters from a numeric field, below worked
select country, substring(cast(code as text) from 1 for 5)) from ghcn_stations
where country = ‘UGANDA’ OR country = ‘TANZANIA’ or country = ‘KENYA’;
–load from text file
copy precip FROM ‘E:/gisdata/World/GHCNv2/v2.prcp.csv’;
–does not work well, reading each line as one long text string
–also could not load big file in excel to convert to csv
–so tried reading it as individual text lines to parse later
select * from precip limit 5;
select substring(codeyr from 1 for 16) from precip limit 10;–this extracts the station code and year i.e. column1
select substring(codeyr from 17 for 5)::numeric from precip limit 100;–this extracts i think the month of jan
select substring(codeyr from 22 for 5)::numeric from precip limit 100;–this extracts i think the month of feb
select substring(codeyr from 27 for 5)::numeric from precip limit 100;–this extracts i think the month of mar
select substring(codeyr from 32 for 5)::numeric from precip limit 100;–this extracts i think the month of apr
select substring(codeyr from 37 for 5)::numeric from precip limit 100;–this extracts i think the month of may
select substring(codeyr from 42 for 5)::numeric from precip limit 100;–this extracts i think the month of jun
select substring(codeyr from 47 for 5)::numeric from precip limit 100;–this extracts i think the month of july
select substring(codeyr from 52 for 5)::numeric from precip limit 100;–this extracts i think the month of aug
select substring(codeyr from 57 for 5)::numeric from precip limit 100;–this extracts i think the month of sep
select substring(codeyr from 62 for 5)::numeric from precip limit 100;–this extracts i think the month of oct
select substring(codeyr from 67 for 5)::numeric from precip limit 100;–this extracts i think the month of nov
select substring(codeyr from 72 for 5)::numeric from precip limit 100;–this extracts i think the month of dec
– so now create a new table with the data parsed right
create table newprecip as
select substring(codeyr from 1 for 11)::numeric as code,
substring(codeyr from 12 for 1)::integer as duplicates,
substring(codeyr from 13 for 4)::integer as year,
substring(codeyr from 17 for 5)::integer as m1,
substring(codeyr from 22 for 5)::integer as m2,
substring(codeyr from 27 for 5)::integer as m3,
substring(codeyr from 32 for 5)::integer as m4,
substring(codeyr from 37 for 5)::integer as m5,
substring(codeyr from 42 for 5)::integer as m6,
substring(codeyr from 47 for 5)::integer as m7,
substring(codeyr from 52 for 5)::integer as m8,
substring(codeyr from 57 for 5)::integer as m9,
substring(codeyr from 62 for 5)::integer as m10,
substring(codeyr from 67 for 5)::integer as m11,
substring(codeyr from 72 for 5)::integer as m12
from precip
;
–check first 12 rows, looks ok
select * from newprecip limit 12;
code|duplicates|year|m1|m2|m3|m4|m5|m6|m7|m8|m9|m10|m11|m12|gid
20743086004|1|1962|0|0|0|833|0|663|3005|1658|2470|419|548|786|1
20743086004|1|1963|0|0|0|-9999|-9999|-9999|1061|2422|-9999|736|0|0|2
20743086004|1|1964|0|0|0|0|0|971|-9999|1963|-9999|86|20|0|3
20743086004|1|1965|0|0|38|84|0|450|2039|1397|1896|0|0|0|4
20743086004|1|1966|1194|0|30|73|128|1011|5265|2544|3710|981|780|240|5
20743086004|1|1967|60|0|380|0|0|2020|3310|2161|1641|0|0|887|6
20743086004|1|1968|340|150|10|30|100|1046|3319|893|2096|729|70|60|7
20743086004|1|1969|330|0|30|0|270|1100|2209|1647|3933|140|150|160|8
20743086004|1|1970|215|0|-9999|40|520|2014|1325|1734|1748|116|0|0|9
20743086005|1|1951|0|0|610|38|259|1473|2974|2136|1186|424|0|0|10
20743086005|1|1952|155|0|10|107|102|889|2532|1994|2624|396|0|0|11
20743086005|1|1953|165|0|0|356|0|4389|3406|5570|3129|922|0|0|12
– add primary key note that it needs to be unique so i have to add a serial first
alter table newprecip add column gid serial not null;
create unique index uidx_newprecip_gid on newprecip using btree(gid);
–
alter table newprecip add constraint
pkey__newprecip_code PRIMARY KEY(gid);
–try out some select queries like extracting data for Masaka, Uganda
select newprecip.*, ghcn_stations.name, ghcn_stations.code
from
ghcn_stations join newprecip
on ghcn_stations.code = newprecip.code
where ghcn_stations.country = ‘UGANDA’
order by name;
–lets try and extract the min and max year of available data
select ghcn_stations.name,min(newprecip.year) as minyr, max(newprecip.year) as maxyr
from newprecip, ghcn_stations
where ghcn_stations.country = ‘UGANDA’
and newprecip.code = ghcn_stations.code
group by ghcn_stations.name
order by maxyr DESC, name;
– but MASAKA in Uganda does not appear in this list…why?
– maybe the codes in the 2 tables that i use to join above dont match?
select ghcn_stations.code as code1
from ghcn_stations
where ghcn_stations.name= ‘MASAKA’;
–
select * from ghcn_stations
where name = ‘MASAKA’;
– both of the above find Masaka
– BUT the country name is missing for MASAKA in the ghcn_stations table
– so lets add it
update ghcn_stations
set country = ‘UGANDA’ where code = 15363705001;
–
VACUUM ANALYZE;
–now repeat earlier query
–lets try and extract the min and max year of available data
select ghcn_stations.name,min(newprecip.year) as minyr, max(newprecip.year) as maxyr
from newprecip, ghcn_stations
where ghcn_stations.country = ‘UGANDA’
and newprecip.code = ghcn_stations.code
group by ghcn_stations.name
order by name;
name|minyr|maxyr
ADUKA|1940|1985
AGORO|1940|1983
ARUA|1940|1992
BUHUKA|1940|1978
BULISA|1940|1985
BUNDIBUGYO|1940|1975
BUNYARUGURU|1940|1975
BUSENYI|1940|1985
BUSIKA|1940|1985
BUTIABA|1904|1979
BUTITI|1940|1985
BWIJANGA|1940|1985
ENTEBBE AIRPORT|1896|2000
FT.PORTAL/SEBUTOLE|1903|1980
GULU|1911|2000
IHUNGU|1940|1984
JINJA|1902|1992
KABALE|1917|1996
KABERAMAINDO|1940|1985
KAKOGE|1940|1975
KALANGALA|1926|1977
KAMPALA|1912|1992
KAPIRI|1940|1992
KASESE|1972|1992
KATAKWI|1927|1984
KAZIBA|1940|1975
KISOMORO|1940|1976
KITGUM V.T.C|1914|1995
KITWE|1940|1975
KOTIDO|1940|1985
KYENJOJO|1940|1982
LIRA NGETTA|1940|1992
LUGANZI|1940|1982
MAGYO|1940|1985
MASAFU|1940|1983
MASAKA|1902|1945
MASINDI|1940|1992
MATIRI|1940|1985
MATUNGA|1940|1992
MBALE|1908|1996
MBARARA|1903|1996
MITYAMA|1940|1985
MOYO|1940|1980
MPUGWE|1940|1985
MUKONO|1940|1985
MUTUNGA|1940|1985
NAMASAGALI/B.GOMBOLO|1915|1979
NAMULONGE|1961|1992
NAMWIWA|1940|1980
NGORA|1940|1982
NTWETWE|1940|1984
SERERE|1940|1985
TORORO|1961|1992
WOBULENZI|1940|1979
– YES! Masaka is there now. Note that Masaka has data only between 1902 and1945…
– Keep in mind that this was only a particular fix.
– There are many null values in the countries field of the original GHCN v2 stations table
– TO DO: these should ideally be filled by a spatial point-in-polygon process…
– …with the help of a countries (multi)polygon spatial table…