chapter 6


Data Storage, Searching and Manipulation

A schema is a collection of tables and functions in a database. There is no single schema that will satisfy the needs of every chemical database user. It might be possible to use an existing schema, perhaps one from this book, and modify it to suit the needs of a particular project. It might be necessary to examine the needs of the project and develop an entirely new schema. The purpose of this chapter is to give examples of useful schemas and to provide enough background to allow design of new schemas.


In Chapter 2, the usefulness of relational tables was introduced. Sample data from the US EPA was used to show the advantage of storing each type of data in a separate table. The data in each table remains related to the proper chemical compound through the use of a unique chemical id which functions as a unique key relating multiple tables. This will be used extensively in this and following chapters. The separation of data into multiple tables also facilitates cases where a compound may have multiple data values, also known as one-to-many relationships. This chapter will also show examples of how many-to-many relationships are handled. It will also show more examples of how the choice of data types affects the operation of the database and the applications that use it. [view wordle] [hide wordle]

Page  
54
Create table pubchem.nci_h23 ( 
   "sid"                  Integer, 
   "ext_datasource_regid" Integer,        
   "cid"                  Integer, 
   "activity_outcome"     Integer,        
   "activity_score"       Integer, 
   "activity_url"         Text,        
   "assaydata_comment"    Text, 
   "assaydata_revoke"     Text, 
   "log_gi50_M"           Numeric, 
   "log_gi50_ugml"        Numeric, 
   "log_gi50_v"           Numeric, 
   "indngi50"             Integer, 
   "stddevgi50"           Numeric, 
   "logtgi_m"             Numeric, 
   "logtgi_ugml"          Numeric, 
   "logtgi_v"             Numeric, 
   "indntgi"              Integer, 
   "stddevtgi"            Numeric        
  );
Comment on table pubchem.nci_h23 is
 'Growth inhibition of the NCI_H23 human Non-Small Cell Lung tumor cell line is measured as a screen
 for anti-cancer activity. Cells are grown in 96 well plates and exposed to the test compound for 48 hours.
 Compounds are tested at 5 different concentrations and three endpoints are estimated from this
 dose response curve: GI50, concentration required for 50% inhibition of growth, TGI,
 the concentration requires for complete inhibition of growth, and LC50, the concentration
 required for 50% reduction in cell number. These estimates are done by simple linear interpolation
 between the concentrations that surround the appropriate level.
 If a compound doesn’’t cause inhibition to the appropriate level, the endpoint is set to the highest
 concentration tested.'; 
Comment on column pubchem.nci_h23."sid" is 'PubChem Substance ID SID'; 
Comment on column pubchem.nci_h23."ext_datasource_regid" is
 'Space holder for PubChem external datasource RegID, empty by default'; 
Comment on column pubchem.nci_h23."cid" is 'Pubchem Compound ID';      
55
Copy pubchem.nci_h23 From Stdin Delimiter ',';
56
Select sid, activity_outcome, "log_gi50_M", log_gi50_ugml
       from nci_h23 where activity_outcome = 2;
Create Table substance ( 
       Title                 text, 
       BONDANNOTATIONS       text, 
       CID_ASSOCIATIONS      text, 
       COMPOUND_ID_TYPE      integer, 
       EXT_DATASOURCE_NAME   text, 
       EXT_DATASOURCE_REGID  text, 
       EXT_DATASOURCE_URL    text, 
       EXT_SUBSTANCE_URL     text, 
       GENBANK_NUCLEOTIDE_ID text, 
       GENBANK_PROTEIN_ID    text, 
       GENERIC_REGISTRY_NAME text, 
       PUBMED_ID             text, 
       SUBSTANCE_COMMENT     text, 
       SUBSTANCE_ID          integer, 
       SUBSTANCE_SYNONYM     text, 
       SUBSTANCE_VERSION     integer, 
       TOTAL_CHARGE          integer, 
       XREF_EXT_ID           text); 
Copy substance ( 
       Title, 
       BONDANNOTATIONS, 
       CID_ASSOCIATIONS, 
       COMPOUND_ID_TYPE, 
       EXT_DATASOURCE_NAME, 
       EXT_DATASOURCE_REGID, 
       EXT_DATASOURCE_URL, 
       EXT_SUBSTANCE_URL, 
       GENBANK_NUCLEOTIDE_ID, 
       GENBANK_PROTEIN_ID, 
       GENERIC_REGISTRY_NAME, 
       PUBMED_ID, 
       SUBSTANCE_COMMENT, 
       SUBSTANCE_ID, 
       SUBSTANCE_SYNONYM, 
       SUBSTANCE_VERSION, 
       TOTAL_CHARGE, 
       XREF_EXT_ID) 
from stdin delimiter ','; 
1,\N,449635 1,0,MOLI,MOLI000002,\N,\N,\N,\N,\N,\N,MOLI - NCI Molecular Imaging Agents\nFGCV,1,MOLI000002,1,0,MOLI000002 
2,2 11 5\n20 34 5\n25 31 6\n28 55 5\n5 13 6\n58 59 6\n8 33 5,\N,0,MOLI,MOLI000003,\N,\N,\N,\N,\N,\N,MOLI - NCI Molecular Imaging Agents\n[99mTc]-P2S2-BBN(7-14),2,MOLI000003,1,0,MOLI000003
57
Select
   sid, ext_datasource_name, substance.ext_datasource_regid,
   activity_outcome, "log_gi50_M", log_gi50_ugml
From
   pubchem.nci_h23
   Join pubchem.substance On substance.substance_id = nci_h23.sid
Where activity_outcome = 2;
58
Select * from pubchem.compound where iupac_name like '%aldehyde%' and heavy_atom_count < 20;
60
Select
   n.sid, sc.compound_id, sc.compound_type, c.openeye_can_smiles,
   s.ext_datasource_name, s.ext_datasource_regid,
   n.activity_outcome, n."log_gi50_M", n.log_gi50_ugml
From
   compound c
   Join substance_compound sc On sc.compound_id = c.cid
   Join substance s On s.substance_id = sc.substance_id
   Join nci_h23 n On n.sid = s.substance_id
Where activity_outcome = 2
Order By sid;
62
Alter Table compound Add Constraint mw_check Check (mw > 0);
Create Function valid_cas(integer, integer, integer) Returns boolean
  As 'Select $1 < 1000000 And $2 < 100 And $3 < 10 And
  $3 = (
  ($2 % 10 * 1) +
  ($2 / 10 * 2) +
  ($1 % 10 / 1 * 3 ) +
  ($1 % 100 / 10 * 4 ) +
  ($1 % 1000 / 100 * 5 ) +
  ($1 % 10000 / 1000 * 6 ) +
  ($1 % 100000 / 10000 * 7 ) +
  ($1 % 1000000 / 100000 * 8 )
  ) % 10;' Language SQL;
63
Alter Table compound Add Constraint cas_check
  Check (valid_cas(caspart(cas,1), caspart(cas,2), caspart(cas,3));
64
Select id,smiles,mw From atable Where mw < 500;
Select id,smiles,mw,logp From atable Where 
   (logp > 0 And mw < 500) Or
   (logp < 0 And mw < 580);
Select
   n.sid, n.activity_outcome, n."log_gi50_M", n.log_gi50_ugml
From
   nci_h23 n
   Where activity_outcome = 2
   Order By sid;
Select
   n.sid, n.activity_outcome, n."log_gi50_M", n.log_gi50_ugml,
   s.ext_datasource_name, s.ext_datasource_regid
From
   nci_h23 n
   Join substance s On s.substance_id = n.sid
 Where activity_outcome = 2
 Order By sid;
65
Select
   n.sid, n.activity_outcome, n."log_gi50_M", n.log_gi50_ugml,
   s.ext_datasource_name, s.ext_datasource_regid,
   sc.compound_id, sc.compound_type
From
   nci_h23 n
   Join substance s On s.substance_id = n.sid
   Join substance_compound sc On sc.substance_id = s.substance_id
 Where activity_outcome = 2
 Order By sid;
Select
   n.sid, n.activity_outcome, n."log_gi50_M", n.log_gi50_ugml,
   s.ext_datasource_name, s.ext_datasource_regid,
   sc.compound_id, sc.compound_type,
   c.openeye_can_smiles
From
   nci_h23 n
   Join substance s On s.substance_id = n.sid
   Join substance_compound sc On sc.substance_id = s.substance_id
   Join compound c On c.cid = sc.compound_id
 Where activity_outcome = 2
 Order By sid;
66
Select amine.smiles As amines, acid.smiles As acids
  From nci.structure amine, nci.structure acid
Where matches(amine.smiles, 'C[N!H0!R][C;D4]')
And matches(acid.smiles, 'CC(=O)[OH]')
Limit 96;
Select amine.smiles As amines, acid.smiles As acids From
 (Select smiles From nci.structure
  Where matches(smiles, 'C[N!H0!R][C;D4]') Limit 8) amine,
 (Select smiles From nci.structure
  Where matches(smiles, 'CC(=O)[OH]') Limit 12) acid;
68
Select logp From properties Where md5(logp) > md5(logp+1);
Select logp From properties Where md5(logp+1) < md5(logp);
Create View test_set As Select smiles, logp From properties
  Where md5(logp) > md5(logp+1);
Create View training_set As Select smiles, logp From properties
 Where md5(logp+1) < md5(logp);
69
Create View nci_h23_set1 As Select
   n.sid, n.activity_outcome, n."log_gi50_M", n.log_gi50_ugml,
   s.ext_datasource_name, s.ext_datasource_regid,
   sc.compound_id, sc.compound_type,
   c.openeye_can_smiles
 From
   nci_h23 n
   Join substance s On s.substance_id = n.sid
   Join substance_compound sc On sc.substance_id = s.substance_id
   Join compound c On c.cid = sc.compound_id;
Select compound_id, "log_gi50_M" From nci_h23_set1
         Where activity_outcome = 2 Order By sid;