GHCN data in a Postgis database

Goal

I’m continuing my experiments with postgresql/postgis databases, using the Global Historical Climate Network (GHCN v2)  dataset of monthly precipitation and temperature data from around the world. I’m going to create and query a database out of GHCN data, downloaded here on May 4, 2010.

Tips

Read the metadata on the site, because the format is a bit tedious.The main things to note are:

  • i had to convert the station list table v2.precip.inv to a csv file before i could copy it into a postgres table
  • i had to import the ~80Mb datafile (v2.prcp) as a postgres table of text rows, and then parse the actual columns out (see sql below) into a new table
  • the code in the stations list table has one digit less than that in the data table – a code for whether there are data duplicates or not (read the metadata file online)
  • i’ve used the raw data here. there is a separate quality-controlled table of data that is MUCH smaller – i have not used that here.
  • -9999 represents null data, -8888 respresents trace precipitation

Code

–create GHCN station table

– drop table ghcn_stations;

create table ghcn_stations (code numeric primary key not null, name varchar(40), country varchar(25), latitude float, longitude float, elevation integer) with OIDS;

–load from text file

copy ghcn_stations FROM ‘E:/gisdata/World/GHCNv2/v2.prcp.inv.csv’ CSV;

–select stations in Uganda

select * from ghcn_stations where country = ‘UGANDA’;

–make a geometry column out of lat and longitude

Alter table ghcn_stations add column the_geom geometry;

–populate with lat and long and 4326 srid

Update ghcn_stations

SET the_geom = PointFromText(‘Point(‘||longitude||”||latitude||’)',4326);

–above did not work

Update ghcn_stations

SET the_geom = ST_Point(longitude,latitude);

–above worked but srid not yet set

Update ghcn_stations

SET the_geom = St_SETSRID(the_geom,4326);

–above set the SRID

–now add spatial index

create index idx_ghcn_stations on ghcn_stations using GIST(the_geom);

VACUUM ANALYZE;

–test out how to extract select characters from a numeric field, below worked

select country, substring(cast(code as text) from 1 for 5)) from ghcn_stations

where country = ‘UGANDA’ OR country = ‘TANZANIA’ or country = ‘KENYA’;

–load from text file

copy precip FROM ‘E:/gisdata/World/GHCNv2/v2.prcp.csv’;

–does not work well, reading each line as one long text string

–also could not load big file in excel to convert to csv

–so tried reading it as individual text lines to parse later

select * from precip limit 5;

select substring(codeyr from 1 for 16) from precip limit 10;–this extracts the station code and year i.e. column1

select substring(codeyr from 17 for 5)::numeric from precip limit 100;–this extracts i think the month of jan

select substring(codeyr from 22 for 5)::numeric from precip limit 100;–this extracts i think the month of feb

select substring(codeyr from 27 for 5)::numeric from precip limit 100;–this extracts i think the month of mar

select substring(codeyr from 32 for 5)::numeric from precip limit 100;–this extracts i think the month of apr

select substring(codeyr from 37 for 5)::numeric from precip limit 100;–this extracts i think the month of may

select substring(codeyr from 42 for 5)::numeric from precip limit 100;–this extracts i think the month of jun

select substring(codeyr from 47 for 5)::numeric from precip limit 100;–this extracts i think the month of july

select substring(codeyr from 52 for 5)::numeric from precip limit 100;–this extracts i think the month of aug

select substring(codeyr from 57 for 5)::numeric from precip limit 100;–this extracts i think the month of sep

select substring(codeyr from 62 for 5)::numeric from precip limit 100;–this extracts i think the month of oct

select substring(codeyr from 67 for 5)::numeric from precip limit 100;–this extracts i think the month of nov

select substring(codeyr from 72 for 5)::numeric from precip limit 100;–this extracts i think the month of dec

– so now create a new table with the data parsed right

create table newprecip as

select substring(codeyr from 1 for 11)::numeric as code,

substring(codeyr from 12 for 1)::integer as duplicates,

substring(codeyr from 13 for 4)::integer as year,

substring(codeyr from 17 for 5)::integer as m1,

substring(codeyr from 22 for 5)::integer as m2,

substring(codeyr from 27 for 5)::integer as m3,

substring(codeyr from 32 for 5)::integer as m4,

substring(codeyr from 37 for 5)::integer as m5,

substring(codeyr from 42 for 5)::integer as m6,

substring(codeyr from 47 for 5)::integer as m7,

substring(codeyr from 52 for 5)::integer as m8,

substring(codeyr from 57 for 5)::integer as m9,

substring(codeyr from 62 for 5)::integer as m10,

substring(codeyr from 67 for 5)::integer as m11,

substring(codeyr from 72 for 5)::integer as m12

from precip

;

–check first 12 rows, looks ok

select * from newprecip limit 12;

code|duplicates|year|m1|m2|m3|m4|m5|m6|m7|m8|m9|m10|m11|m12|gid
20743086004|1|1962|0|0|0|833|0|663|3005|1658|2470|419|548|786|1
20743086004|1|1963|0|0|0|-9999|-9999|-9999|1061|2422|-9999|736|0|0|2
20743086004|1|1964|0|0|0|0|0|971|-9999|1963|-9999|86|20|0|3
20743086004|1|1965|0|0|38|84|0|450|2039|1397|1896|0|0|0|4
20743086004|1|1966|1194|0|30|73|128|1011|5265|2544|3710|981|780|240|5
20743086004|1|1967|60|0|380|0|0|2020|3310|2161|1641|0|0|887|6
20743086004|1|1968|340|150|10|30|100|1046|3319|893|2096|729|70|60|7
20743086004|1|1969|330|0|30|0|270|1100|2209|1647|3933|140|150|160|8
20743086004|1|1970|215|0|-9999|40|520|2014|1325|1734|1748|116|0|0|9
20743086005|1|1951|0|0|610|38|259|1473|2974|2136|1186|424|0|0|10
20743086005|1|1952|155|0|10|107|102|889|2532|1994|2624|396|0|0|11
20743086005|1|1953|165|0|0|356|0|4389|3406|5570|3129|922|0|0|12

– add primary key note that it needs to be unique so i have to add a serial first

alter table newprecip add column gid serial not null;

create unique index uidx_newprecip_gid on newprecip using btree(gid);

alter table newprecip add constraint

pkey__newprecip_code PRIMARY KEY(gid);

–try out some select queries like extracting data for Masaka, Uganda

select newprecip.*, ghcn_stations.name, ghcn_stations.code

from

ghcn_stations join newprecip

on ghcn_stations.code = newprecip.code

where ghcn_stations.country = ‘UGANDA’

order by name;

–lets try and extract the min and max year of available data

select ghcn_stations.name,min(newprecip.year) as minyr, max(newprecip.year) as maxyr

from newprecip, ghcn_stations

where ghcn_stations.country = ‘UGANDA’

and newprecip.code = ghcn_stations.code

group by ghcn_stations.name

order by maxyr DESC, name;

– but MASAKA in Uganda does not appear in this list…why?

– maybe the codes in the 2 tables that i use to join above dont match?

select ghcn_stations.code as code1

from ghcn_stations

where ghcn_stations.name= ‘MASAKA’;

select * from ghcn_stations

where name = ‘MASAKA’;

– both of the above find Masaka

– BUT the country name is missing for MASAKA in the ghcn_stations table

– so lets add it

update ghcn_stations

set country = ‘UGANDA’ where code = 15363705001;

VACUUM ANALYZE;

–now repeat earlier query

–lets try and extract the min and max year of available data

select ghcn_stations.name,min(newprecip.year) as minyr, max(newprecip.year) as maxyr

from newprecip, ghcn_stations

where ghcn_stations.country = ‘UGANDA’

and newprecip.code = ghcn_stations.code

group by ghcn_stations.name

order by name;

name|minyr|maxyr
ADUKA|1940|1985
AGORO|1940|1983
ARUA|1940|1992
BUHUKA|1940|1978
BULISA|1940|1985
BUNDIBUGYO|1940|1975
BUNYARUGURU|1940|1975
BUSENYI|1940|1985
BUSIKA|1940|1985
BUTIABA|1904|1979
BUTITI|1940|1985
BWIJANGA|1940|1985
ENTEBBE AIRPORT|1896|2000
FT.PORTAL/SEBUTOLE|1903|1980
GULU|1911|2000
IHUNGU|1940|1984
JINJA|1902|1992
KABALE|1917|1996
KABERAMAINDO|1940|1985
KAKOGE|1940|1975
KALANGALA|1926|1977
KAMPALA|1912|1992
KAPIRI|1940|1992
KASESE|1972|1992
KATAKWI|1927|1984
KAZIBA|1940|1975
KISOMORO|1940|1976
KITGUM V.T.C|1914|1995
KITWE|1940|1975
KOTIDO|1940|1985
KYENJOJO|1940|1982
LIRA NGETTA|1940|1992
LUGANZI|1940|1982
MAGYO|1940|1985
MASAFU|1940|1983
MASAKA|1902|1945
MASINDI|1940|1992
MATIRI|1940|1985
MATUNGA|1940|1992
MBALE|1908|1996
MBARARA|1903|1996
MITYAMA|1940|1985
MOYO|1940|1980
MPUGWE|1940|1985
MUKONO|1940|1985
MUTUNGA|1940|1985
NAMASAGALI/B.GOMBOLO|1915|1979
NAMULONGE|1961|1992
NAMWIWA|1940|1980
NGORA|1940|1982
NTWETWE|1940|1984
SERERE|1940|1985
TORORO|1961|1992
WOBULENZI|1940|1979

– YES! Masaka is there now. Note that Masaka has data only between 1902 and1945…

– Keep in mind that this was only a particular fix.

– There are many null values in the countries field of the original GHCN v2 stations table

– TO DO: these should ideally be filled by a spatial point-in-polygon process…

– …with the help of a countries (multi)polygon spatial table…

Advertisement

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Connecting to %s

Follow

Get every new post delivered to your Inbox.