Showing posts with label Data warehouse. Show all posts
Showing posts with label Data warehouse. Show all posts

Monday, September 5, 2011

Data Profiling Script

I recently needed to profile a database - pull out the schema, and also do some basic data quality checks. So, in the spirit of laziness, er automation, I wrote the following:

if object_Id('tempdb..#Output') is not null
drop table #output
go
Set Nocount on
go
declare @DataBase varchar(200) -- the name of the Database

Select @DataBase = 'Put Your Table Name here' -- Remember to enclose in quotes

if OBJECT_ID('tempdb..#Output') is not null
drop table #Output

select SO.name as TableName
,SC.name as ColumnName
,ST.name as DataType
,SC.length as DatTypePrecision
,CONVERT(int,0) as [RowCount]
,CONVERT(int,0) as DistinctRowCount
,CONVERT(int,0) as MinLength
,CONVERT(int,0) as MaxLength
,CONVERT(float,0) as AVGValue
,CONVERT(varchar(200),'') as MinValue
,CONVERT(varchar(200),'') as MaxValue
,CONVERT(bit,0) as isDateField
,CONVERT(bit,0) as isValidDate
into #Output
from sysObjects SO
join sysColumns SC
on SO.id = SC.id
join sys.Types ST
on SC.xType = ST.system_type_ID

where so.name = @DataBase

declare curLoop cursor
For
Select TableName
,ColumnName
,DataType
from #Output

Declare @TableName varchar(200)
,@Column Varchar(200)
,@DataType Varchar(200)
,@SQL varchar(max)
Open CurLoop

Fetch Next From curLoop
into @TableName
,@Column
,@DataType


While @@FETCH_STATUS = 0
begin

Select @SQL = 'update #Output set [RowCount]= (select count([' + @Column + ']) from [' + @TableName + ']) where [ColumnName]=''' + @Column +''''
exec (@SQL)

Select @SQL = 'update #Output set [DistinctRowCount]= (select count(distinct [' + @Column + ']) from [' + @TableName + ']) where [ColumnName]=''' + @Column +''''
exec (@SQL)

Select @SQL = 'update #Output set [minLength]= (select min(len([' + @Column + '])) from [' + @TableName + ']) where [ColumnName]=''' + @Column +''''
exec (@SQL)

Select @SQL = 'update #Output set [MaxLength]= (select max(len([' + @Column + '])) from [' + @TableName + ']) where [ColumnName]=''' + @Column +''''
exec (@SQL)

Select @SQL = 'update #Output set [AVGValue]= (select AVG([' + @Column + ']) from [' + @TableName + ']) where [DataType] in (''int'',''float'') and [ColumnName]=''' + @Column +''''
BEGIN TRY
exec (@SQL)
END TRY
BEGIN CATCH
Select @SQL = 'update #Output set [AVGValue]= 0 where [ColumnName]=''' + @Column +''''
END CATCH

Select @SQL = 'update #Output set [MinValue]= (select min(convert(varchar(max),[' + @Column + '])) from [' + @TableName + ']) where [ColumnName]=''' + @Column +''''
exec (@SQL)

Select @SQL = 'update #Output set [MaxValue]= (select max(convert(varchar(max),[' + @Column + '])) from [' + @TableName + ']) where [ColumnName]=''' + @Column +''''
exec (@SQL)


Select @SQL = 'update #Output set [isDateField]= case when columnname like (''%Date%'') then 1 else 0 end where [ColumnName]=''' + @Column +''''
exec (@SQL)
--print @SQL

Select @SQL = 'update #Output set [isValidDate]= 1 where [ColumnName]=''' + @Column +'''and [isDateField]=1 and IsDate([MinValue]) = 1 and IsDate([MaxValue]) = 1'
exec (@SQL)


Fetch Next From curLoop
into @TableName
,@Column
,@DataType

end

close curLoop
Deallocate curloop





select *
from #Output

Wednesday, May 7, 2008

Dynamic Info Matrix

Hi

I recently had to do an Info matrix for a DWH with no referential integrity - all that they had were neatly named tables and key columns (but no foreign keys). There were 45 Facts and 49 dimensions. Out of sheer laziness I decided to rather create it using code than manually go and fill in the ticks. so here's the code to do it (output it to text using SSMS, and paste into your spreadsheet tool of choice, split the text data and voila !!!)

if Object_ID ('tempdb..#IM') is not null
drop table #IM
if Object_ID ('tempdb..#MasterList') is not null
drop table #MasterList

if Object_ID ('tempdb..#Dim') is not null
drop table #Dim
Go
Set Nocount on
go

declare @FactTable_Convention varchar(200) -- How are fact tables named
,@DimSK_Convention varchar(200) -- How are dim_sk's indicated
,@Warehouse_DB varchar(200) -- what is the warehouse name
,@IM_UsageIndicator varchar(20) -- what value is used to indicate that
-- a table uses a dim

Select @FactTable_Convention = '%_fact'
,@DimSK_Convention = '%_id'
,@Warehouse_DB = 'Some_DWH'
,@IM_UsageIndicator = 'a' --Webdings makes this a tick mark

-- Internal variables
Declare @intRow int -- tracks the current row
,@intRows int -- tracks the number of rows
,@Query varchar(max)



Create table #MasterList
(
RowID int identity (1,1)
,FactName varchar(200)
,SKName varchar(200)
)


Select @Query = 'Insert into #MasterList (FactName, SKname)'
+ ' Select SO.Name, case when SC.Name like '
+ char(39) + '%Date%' + char(39)
+ 'then ' + char(39) + 'Date dimension' +char(39)
+ ' else SC.Name end '
+ 'from '
+ @WareHouse_DB
+ '..SysObjects SO join '
+ + @WareHouse_DB
+ '..sysColumns SC on SO.ID = SC.ID where SO.XType = '
+ char(39) + 'U' + char(39)
+ ' and SO.Name like '
+ char(39)
+ @FactTable_Convention
+ Char(39)
+ ' and (SC.Name like '
+ char(39)
+ replace(@DimSK_Convention,',',' or SC.Name like ')
+ Char(39) + ') and SC.TypeStat=0' --TypeStat indicates ID columns

--Print @Query
Exec (@Query)



Create table #IM
(
RowID int identity (1,1)
,FactName varchar(200)
,Fact varchar(200)
)

Insert into #IM (FactName,Fact)
Select distinct FactName,
Replace(FactName,replace(@FactTable_Convention,'%',''),'')
from #MasterList


Create table #Dim
(
DimID int identity (1,1)
,DimKeyName varchar(200)
,Dimension varchar(200)

)

Insert into #Dim
(
DimKeyName
,Dimension
)
Select distinct case
when SKName like '%Date%'
then 'Date dimension'
else SKName
end,
case
when Replace(SkName,replace(@DimSK_Convention,'%',''),'')like '%Date%'
then 'Date dimension'
else Replace(SkName,replace(@DimSK_Convention,'%',''),'')
end
from #MasterList

Select @intRow = min(DimID)
,@intRows = max(DimID)
from #Dim

While @intRow <= @intRows --iterate Dim columns
begin

select @Query = 'Alter table #IM add ['
+ Dimension
+ '] varchar(5)'
from #dim
Where DimID = @intRow

--Print @Query
exec (@Query)

-- now check in which tables it occurs
Select @Query = 'Update #IM set ['
+ Dimension
+ '] = '
+ char(39)
+ @IM_UsageIndicator
+ char(39)
+ ' from #IM IM join (select FactName from #masterlist where SKName= '
+ char(39) + DimKeyName + char(39)
+ ') List on IM.FactName = List.FactName'
from #Dim
Where DimID = @intRow

--Print @Query
exec (@Query)

Select @Query = 'Update #IM set ['
+ Dimension
+ '] = '
+ char(39) + char(39)
+ ' Where ['
+ Dimension
+ '] is null'
from #Dim
Where DimID = @intRow

--Print @Query
exec (@Query)

Select @intRow = min(DimID)
from #Dim
Where DimID > @intRow

End

Alter table #IM
drop Column RowID


Alter table #IM
drop Column FactName

select *
from #IM

----Print courtesy of http://my-pc-hates-me.blogspot.com