--CREATE PROCEDURE [dbo].[strip_low_ascii]
--as
DECLARE @columns TABLE (id INT IDENTITY, column_name sysname)
DECLARE @odd_ascii CHAR(1),
@ascii_value int,
@column_start smallint,
@column_end SMALLINT,
@sql NVARCHAR(4000),
@field_name sysname, --to allow us to loop through all fields
@table_schema sysname,
@table_name sysname
SET @ascii_value = 31 --31 aka 0x1F. Space is 32/0x20. 0-30 is our "low-ascii" range
SET @table_schema = 'dbo'
SET @table_name = 'yourtablename'
CREATE TABLE #list_of_id (id bigint primary key)
INSERT INTO @columns
SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_SCHEMA = @table_schema
AND TABLE_NAME = @table_name
AND DATA_TYPE IN ('char','varchar','nchar','nvarchar')
SELECT @column_start = MIN(id), @column_end = MAX(id) FROM @columns
WHILE @ascii_value >= 0 --look for all low ascii
BEGIN
SET @odd_ascii = NULL
SET @column_start = 1
SELECT @odd_ascii = CHAR(@ascii_value)
WHILE @column_start <= @column_end
BEGIN
TRUNCATE TABLE #list_of_id
SET @field_name = NULL
SET @sql = NULL
SELECT @field_name = column_name FROM @columns WHERE id = @column_start
SELECT @sql = 'insert into #list_of_id
SELECT id FROM ' + QUOTENAME(@table_schema) + '.' + QUOTENAME(@table_name) + '
WHERE insert_datetime >=CONVERT(CHAR(8),GETDATE(),112)
AND ' + @field_name + ' LIKE ''%'' + @oddascii + ''%''
IF @@rowcount >0 --( select count from @list_of_id )
update ' + @table_name + ' set ' + @field_name + ' = REPLACE(' + @field_name + ', @oddascii, '''')
WHERE insert_datetime >=CONVERT(CHAR(8),GETDATE(),112)
and id in (select id from #list_of_id)
AND ' + @field_name + ' LIKE ''%'' + @oddascii + ''%'''
--PRINT @sql
EXECUTE sp_executesql @sql, N'@oddascii char(1)', @oddascii = @odd_ascii
SET @column_start = @column_start + 1
END
SET @ascii_value = @ascii_value-1
END
Thursday, February 17, 2011
[Code] Stripping low-ascii out of a table, slow way
I'm working on a CLR to do this considerably faster, but here's what I have for now. The purpose of this is to remove dirty data from an upstream feed. We can't make XML with it - the code that creates the XML chokes on it. Note that this isn't terribly fast, because we're brute-forcing it and having to loop through (31 * number-of-fields) times. The CLR should do it in one pass per field.
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment