Thursday, February 17, 2011

[Code] Stripping low-ascii out of a table, slow way

I'm working on a CLR to do this considerably faster, but here's what I have for now. The purpose of this is to remove dirty data from an upstream feed. We can't make XML with it - the code that creates the XML chokes on it. Note that this isn't terribly fast, because we're brute-forcing it and having to loop through (31 * number-of-fields) times. The CLR should do it in one pass per field.


--CREATE PROCEDURE [dbo].[strip_low_ascii]
--as
DECLARE @columns TABLE (id INT IDENTITY, column_name sysname)
DECLARE @odd_ascii CHAR(1),
@ascii_value int,
@column_start smallint,
@column_end SMALLINT,
@sql NVARCHAR(4000),
@field_name sysname, --to allow us to loop through all fields
@table_schema sysname,
@table_name sysname
SET @ascii_value = 31 --31 aka 0x1F. Space is 32/0x20. 0-30 is our "low-ascii" range
SET @table_schema = 'dbo'
SET @table_name = 'yourtablename'

CREATE TABLE #list_of_id (id bigint primary key)

INSERT INTO @columns
SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_SCHEMA = @table_schema
AND TABLE_NAME = @table_name
AND DATA_TYPE IN ('char','varchar','nchar','nvarchar')

SELECT @column_start = MIN(id), @column_end = MAX(id) FROM @columns

WHILE @ascii_value >= 0 --look for all low ascii
BEGIN
SET @odd_ascii = NULL
SET @column_start = 1
SELECT @odd_ascii = CHAR(@ascii_value)

WHILE @column_start <= @column_end
BEGIN
TRUNCATE TABLE #list_of_id
SET @field_name = NULL
SET @sql = NULL
SELECT @field_name = column_name FROM @columns WHERE id = @column_start
SELECT @sql = 'insert into #list_of_id
SELECT id FROM ' + QUOTENAME(@table_schema) + '.' + QUOTENAME(@table_name) + '
WHERE insert_datetime >=CONVERT(CHAR(8),GETDATE(),112)
AND ' + @field_name + ' LIKE ''%'' + @oddascii + ''%''
IF @@rowcount >0 --( select count from @list_of_id )
update ' + @table_name + ' set ' + @field_name + ' = REPLACE(' + @field_name + ', @oddascii, '''')
WHERE insert_datetime >=CONVERT(CHAR(8),GETDATE(),112)
and id in (select id from #list_of_id)
AND ' + @field_name + ' LIKE ''%'' + @oddascii + ''%'''


--PRINT @sql

EXECUTE sp_executesql @sql, N'@oddascii char(1)', @oddascii = @odd_ascii

SET @column_start = @column_start + 1
END

SET @ascii_value = @ascii_value-1
END


No comments: