Monday, September 5, 2011

Data Profiling Script

I recently needed to profile a database - pull out the schema, and also do some basic data quality checks. So, in the spirit of laziness, er automation, I wrote the following:

if object_Id('tempdb..#Output') is not null
drop table #output
go
Set Nocount on
go
declare @DataBase varchar(200) -- the name of the Database

Select @DataBase = 'Put Your Table Name here' -- Remember to enclose in quotes

if OBJECT_ID('tempdb..#Output') is not null
drop table #Output

select SO.name as TableName
,SC.name as ColumnName
,ST.name as DataType
,SC.length as DatTypePrecision
,CONVERT(int,0) as [RowCount]
,CONVERT(int,0) as DistinctRowCount
,CONVERT(int,0) as MinLength
,CONVERT(int,0) as MaxLength
,CONVERT(float,0) as AVGValue
,CONVERT(varchar(200),'') as MinValue
,CONVERT(varchar(200),'') as MaxValue
,CONVERT(bit,0) as isDateField
,CONVERT(bit,0) as isValidDate
into #Output
from sysObjects SO
join sysColumns SC
on SO.id = SC.id
join sys.Types ST
on SC.xType = ST.system_type_ID

where so.name = @DataBase

declare curLoop cursor
For
Select TableName
,ColumnName
,DataType
from #Output

Declare @TableName varchar(200)
,@Column Varchar(200)
,@DataType Varchar(200)
,@SQL varchar(max)
Open CurLoop

Fetch Next From curLoop
into @TableName
,@Column
,@DataType


While @@FETCH_STATUS = 0
begin

Select @SQL = 'update #Output set [RowCount]= (select count([' + @Column + ']) from [' + @TableName + ']) where [ColumnName]=''' + @Column +''''
exec (@SQL)

Select @SQL = 'update #Output set [DistinctRowCount]= (select count(distinct [' + @Column + ']) from [' + @TableName + ']) where [ColumnName]=''' + @Column +''''
exec (@SQL)

Select @SQL = 'update #Output set [minLength]= (select min(len([' + @Column + '])) from [' + @TableName + ']) where [ColumnName]=''' + @Column +''''
exec (@SQL)

Select @SQL = 'update #Output set [MaxLength]= (select max(len([' + @Column + '])) from [' + @TableName + ']) where [ColumnName]=''' + @Column +''''
exec (@SQL)

Select @SQL = 'update #Output set [AVGValue]= (select AVG([' + @Column + ']) from [' + @TableName + ']) where [DataType] in (''int'',''float'') and [ColumnName]=''' + @Column +''''
BEGIN TRY
exec (@SQL)
END TRY
BEGIN CATCH
Select @SQL = 'update #Output set [AVGValue]= 0 where [ColumnName]=''' + @Column +''''
END CATCH

Select @SQL = 'update #Output set [MinValue]= (select min(convert(varchar(max),[' + @Column + '])) from [' + @TableName + ']) where [ColumnName]=''' + @Column +''''
exec (@SQL)

Select @SQL = 'update #Output set [MaxValue]= (select max(convert(varchar(max),[' + @Column + '])) from [' + @TableName + ']) where [ColumnName]=''' + @Column +''''
exec (@SQL)


Select @SQL = 'update #Output set [isDateField]= case when columnname like (''%Date%'') then 1 else 0 end where [ColumnName]=''' + @Column +''''
exec (@SQL)
--print @SQL

Select @SQL = 'update #Output set [isValidDate]= 1 where [ColumnName]=''' + @Column +'''and [isDateField]=1 and IsDate([MinValue]) = 1 and IsDate([MaxValue]) = 1'
exec (@SQL)


Fetch Next From curLoop
into @TableName
,@Column
,@DataType

end

close curLoop
Deallocate curloop





select *
from #Output