Thursday, 22 November 2018

Always-On Monitoring using Datadog




Always-On Monitoring


The below three error code which need to monitor for Always-on

Query:
SELECT * FROM SYS.SYSMESSAGES WHERE MSGLANGID=1033 AND DESCRIPTION LIKE '%AVAILABILITY%' AND ERROR IN (1480, 35264, 35265)

error
Description
1480
The %S_MSG database "%.*ls" is changing roles from "%ls" to "%ls" because the mirroring session or availability group failed over due to %S_MSG. This is an informational message only. No user action is required.
35264
Always On Availability Groups data movement for database '%.*ls' has been suspended for the following reason: "%S_MSG" (Source ID %d; Source string: '%.*ls'). To resume data movement on the database, you will need to resume the database manually. For info
35265
Always On Availability Groups data movement for database '%.*ls' has been resumed. This is an informational message only. No user action is required.

Note: As we do not have DB mail option so that we need to use Datadog alerting.
Let’s begin….
Error: 1480: The Always on Failover occurred. The below steps need to perform on Primary and DR server.
Step 1: Create an Alert in SQL Server to capture the error event.
USE [msdb]
GO
EXEC msdb.dbo.sp_add_alert @name=N'AG_FailOver',
              @message_id=1480,
              @severity=0,
              @enabled=1,
              @delay_between_responses=0,
              @include_event_description_in=0,
              @category_name=N'[Uncategorized]'--,
              --@job_id=N'6f93aabb-a0ba-4e57-8695-1febaebe6c7c'
GO
Step 2: Create a Job in SQL server to send the error details to table.
USE [msdb]
GO

BEGIN TRANSACTION
DECLARE @ReturnCode INT
SELECT @ReturnCode = 0
/****** Object:  JobCategory [[Uncategorized (Local)]]    Script Date: 11/12/2018 4:44:24 AM ******/
IF NOT EXISTS (SELECT name FROM msdb.dbo.syscategories WHERE name=N'[Uncategorized (Local)]' AND category_class=1)
BEGIN
EXEC @ReturnCode = msdb.dbo.sp_add_category @class=N'JOB', @type=N'LOCAL', @name=N'[Uncategorized (Local)]'
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback

END

DECLARE @jobId BINARY(16)
EXEC @ReturnCode =  msdb.dbo.sp_add_job @job_name=N'AG_FailOver_Job',
              @enabled=1,
              @notify_level_eventlog=0,
              @notify_level_email=0,
              @notify_level_netsend=0,
              @notify_level_page=0,
              @delete_level=0,
              @description=N'No description available.',
              @category_name=N'[Uncategorized (Local)]',
              @owner_login_name=N'sa', @job_id = @jobId OUTPUT
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
/****** Object:  Step [AG_FailOver]    Script Date: 11/12/2018 4:44:24 AM ******/
EXEC @ReturnCode = msdb.dbo.sp_add_jobstep @job_id=@jobId, @step_name=N'AG_FailOver',
              @step_id=1,
              @cmdexec_success_code=0,
              @on_success_action=1,
              @on_success_step_id=0,
              @on_fail_action=2,
              @on_fail_step_id=0,
              @retry_attempts=0,
              @retry_interval=0,
              @os_run_priority=0, @subsystem=N'TSQL',
              @command=N'Insert into [CSMonitor].[dbo].[AG_ERROR_FailOver] (Error_code,Error_desc,date) values(1480,''AG Failover'',getdate())',
              @database_name=N'master',
              @flags=0
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
EXEC @ReturnCode = msdb.dbo.sp_update_job @job_id = @jobId, @start_step_id = 1
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
EXEC @ReturnCode = msdb.dbo.sp_add_jobserver @job_id = @jobId, @server_name = N'(local)'
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
COMMIT TRANSACTION
GOTO EndSave
QuitWithRollback:
    IF (@@TRANCOUNT > 0) ROLLBACK TRANSACTION
EndSave:
GO



Step 3: Create a table in CSMonitor database to log the error details.
USE [CSMonitor]
GO

SET ANSI_NULLS ON
GO

SET QUOTED_IDENTIFIER ON
GO

CREATE TABLE [dbo].[AG_ERROR_FailOver](
       [Id] [int] IDENTITY(1,1) NOT NULL,
       [Error_code] [int] NULL,
       [Error_desc] [varchar](max) NULL,
       [Date] [datetime] NULL
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
GO

Step 4: Open the property of the created Alert (AG_FailOver), Click on Response à  check the Execute Job check box à Select the create job(AG_FailOver_Job) from drop down list and Press OK button.


Error: 35264: The Data Moment suspended. The below steps need to perform on Primary and DR server.
Step 1: Create an Alert in SQL Server to capture the error event.
USE [msdb]
GO

EXEC msdb.dbo.sp_add_alert @name=N'AG_Data_Moment_Suspended',
              @message_id=35264,
              @severity=0,
              @enabled=1,
              @delay_between_responses=0,
              @include_event_description_in=0,
              @category_name=N'[Uncategorized]'--,
              --@job_id=N'6f93aabb-a0ba-4e57-8695-1febaebe6c7c'
GO
Step 2: Create a Job in SQL server to send the error details to table.

USE [msdb]
GO

BEGIN TRANSACTION
DECLARE @ReturnCode INT
SELECT @ReturnCode = 0
/****** Object:  JobCategory [[Uncategorized (Local)]]    Script Date: 11/12/2018 4:44:24 AM ******/
IF NOT EXISTS (SELECT name FROM msdb.dbo.syscategories WHERE name=N'[Uncategorized (Local)]' AND category_class=1)
BEGIN
EXEC @ReturnCode = msdb.dbo.sp_add_category @class=N'JOB', @type=N'LOCAL', @name=N'[Uncategorized (Local)]'
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback

END

DECLARE @jobId BINARY(16)
EXEC @ReturnCode =  msdb.dbo.sp_add_job @job_name=N'AG_Data_Movment_Suspended_Job',
              @enabled=1,
              @notify_level_eventlog=0,
              @notify_level_email=0,
              @notify_level_netsend=0,
              @notify_level_page=0,
              @delete_level=0,
              @description=N'No description available.',
              @category_name=N'[Uncategorized (Local)]',
              @owner_login_name=N'sa', @job_id = @jobId OUTPUT
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
/****** Object:  Step [AG_Data_Movment_Suspended]    Script Date: 11/12/2018 4:44:24 AM ******/
EXEC @ReturnCode = msdb.dbo.sp_add_jobstep @job_id=@jobId, @step_name=N'AG_Data_Movment_Suspended',
              @step_id=1,
              @cmdexec_success_code=0,
              @on_success_action=1,
              @on_success_step_id=0,
              @on_fail_action=2,
              @on_fail_step_id=0,
              @retry_attempts=0,
              @retry_interval=0,
              @os_run_priority=0, @subsystem=N'TSQL',
              @command=N'Insert into [CSMonitor].[dbo].[AG_ERROR_Data_Moment_suspended] (Error_code,Error_desc,date) values(35264,''Data Moment suspended'',getdate())',
              @database_name=N'master',
              @flags=0
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
EXEC @ReturnCode = msdb.dbo.sp_update_job @job_id = @jobId, @start_step_id = 1
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
EXEC @ReturnCode = msdb.dbo.sp_add_jobserver @job_id = @jobId, @server_name = N'(local)'
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
COMMIT TRANSACTION
GOTO EndSave
QuitWithRollback:
    IF (@@TRANCOUNT > 0) ROLLBACK TRANSACTION
EndSave:
GO

Step 3: Create a table in CSMonitor database to log the error details.
USE [CSMonitor]
GO

SET ANSI_NULLS ON
GO

SET QUOTED_IDENTIFIER ON
GO

CREATE TABLE [dbo].[AG_ERROR_Data_Moment_suspended](
       [Id] [int] IDENTITY(1,1) NOT NULL,
       [Error_code] [int] NULL,
       [Error_desc] [varchar](max) NULL,
       [Date] [datetime] NULL
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
GO


Step 4: Open the property of the created Alert (AG_Data_Moment_Suspended), Click on Response à  check the Execute Job check box à Select the create job() from drop down list and Press OK button.


Error: 35265: The Data Moment resumed. The below steps need to perform on Primary and DR server.
Step 1: Create an Alert in SQL Server to capture the error event.
USE [msdb]
GO

EXEC msdb.dbo.sp_add_alert @name=N'AG_Data_Moment_Resumed',
              @message_id=35265,
              @severity=0,
              @enabled=1,
              @delay_between_responses=0,
              @include_event_description_in=0,
              @category_name=N'[Uncategorized]'--,
              --@job_id=N'6f93aabb-a0ba-4e57-8695-1febaebe6c7c'
GO
Step 2: Create a Job in SQL server to send the error details to table.
USE [msdb]
GO

BEGIN TRANSACTION
DECLARE @ReturnCode INT
SELECT @ReturnCode = 0
/****** Object:  JobCategory [[Uncategorized (Local)]]    Script Date: 11/12/2018 4:44:24 AM ******/
IF NOT EXISTS (SELECT name FROM msdb.dbo.syscategories WHERE name=N'[Uncategorized (Local)]' AND category_class=1)
BEGIN
EXEC @ReturnCode = msdb.dbo.sp_add_category @class=N'JOB', @type=N'LOCAL', @name=N'[Uncategorized (Local)]'
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback

END

DECLARE @jobId BINARY(16)
EXEC @ReturnCode =  msdb.dbo.sp_add_job @job_name=N'AG_Data_Movment_Resumed_Job',
              @enabled=1,
              @notify_level_eventlog=0,
              @notify_level_email=0,
              @notify_level_netsend=0,
              @notify_level_page=0,
              @delete_level=0,
              @description=N'No description available.',
              @category_name=N'[Uncategorized (Local)]',
              @owner_login_name=N'sa', @job_id = @jobId OUTPUT
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
/****** Object:  Step [AG_Data_Movment_Resumed]    Script Date: 11/12/2018 4:44:24 AM ******/
EXEC @ReturnCode = msdb.dbo.sp_add_jobstep @job_id=@jobId, @step_name=N'AG_Data_Movment_Resumed',
              @step_id=1,
              @cmdexec_success_code=0,
              @on_success_action=1,
              @on_success_step_id=0,
              @on_fail_action=2,
              @on_fail_step_id=0,
              @retry_attempts=0,
              @retry_interval=0,
              @os_run_priority=0, @subsystem=N'TSQL',
              @command=N'Insert into [CSMonitor].[dbo].[AG_ERROR_Data_Moment_Resumed] (Error_code,Error_desc,date) values(35265,''Data Moment Resumed'',getdate())',
              @database_name=N'master',
              @flags=0
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
EXEC @ReturnCode = msdb.dbo.sp_update_job @job_id = @jobId, @start_step_id = 1
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
EXEC @ReturnCode = msdb.dbo.sp_add_jobserver @job_id = @jobId, @server_name = N'(local)'
IF (@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
COMMIT TRANSACTION
GOTO EndSave
QuitWithRollback:
    IF (@@TRANCOUNT > 0) ROLLBACK TRANSACTION
EndSave:
GO


Step 3: Create a table in CSMonitor database to log the error details.
USE [CSMonitor]
GO

SET ANSI_NULLS ON
GO

SET QUOTED_IDENTIFIER ON
GO

CREATE TABLE [dbo].[AG_ERROR_Data_Moment_Resumed](
       [Id] [int] IDENTITY(1,1) NOT NULL,
       [Error_code] [int] NULL,
       [Error_desc] [varchar](max) NULL,
       [Date] [datetime] NULL
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
GO


Step 4: Open the property of the created Alert (AG_Data_Moment_Resumed), Click on Response à  check the Execute Job check box à Select the create job(AG_Data_Moment_Resumed_Job) from drop down list and Press OK button.




·         Modify DataDog stored procedure to add the script for Datadog agent. Copy below SP and execute on Primary and DR server.

USE [msdb]
GO
/****** Object:  StoredProcedure [dbo].[SP_datadogtest]    Script Date: 11/12/2018 5:20:16 AM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO




ALTER PROCEDURE [dbo].[SP_datadogtest]
AS
BEGIN
       -- SET NOCOUNT ON added to prevent extra result sets from
       -- interfering with SELECT statements.
       SET NOCOUNT ON;

declare @AA int
declare @bb int
declare @cc int
declare @dd int
declare @ee int
declare @ff int
declare @gg int
declare @hh int
declare @ii int

declare @jj int
declare @kk int
declare @ll int
DECLARE @DBCOUNT INT

-- If the job got failed, success or cancel by anyone.
set @aa=( SELECt COUNT(*) FROM MSDB.DBO.SYSJOBSERVERS SJS
LEFT OUTER JOIN MSDB.DBO.SYSJOBS SJ ON (SJ.JOB_ID = SJS.JOB_ID)
where SJS.LAST_RUN_OUTCOME in (0,2) and enabled =1
and convert(varchar(10), last_run_date, 126) = (convert(varchar(10), GETDATE(), 112) )) -- 0 failed 2 cannaclled

-- If the log file got full more than 80 percente
create table #TmpLOGSPACE(
  DatabaseName varchar(100)
  , LOGSIZE_MB decimal(18, 9)
  , LOGSPACE_USED decimal(18, 9)
  ,  LOGSTATUS decimal(18, 9))

insert #TmpLOGSPACE(DatabaseName, LOGSIZE_MB, LOGSPACE_USED, LOGSTATUS)
exec ('DBCC SQLPERF(LOGSPACE);')
set @bb= (select count(*) from #TmpLOGSPACE where DatabaseName not in ('Model','master','msdb')
and LOGSPACE_USED > 90) -- This is value which can be change as per our requirement.
-- Drop table #TmpLOGSPACE

-- If the blocking is more than 10 minutes

set @cc=(SELECT count (*) FROM SYS.DM_EXEC_REQUESTS REQ
CROSS APPLY SYS.DM_EXEC_SQL_TEXT(SQL_HANDLE) AS SQLTEXT
WHERE TOTAL_ELAPSED_TIME >=600000 -- 10 MINUTES
and blocking_session_id <>0)

-- If query running more than 1 hour

set @dd=(SELECT count(*) FROM SYS.DM_EXEC_REQUESTS REQ
CROSS APPLY SYS.DM_EXEC_SQL_TEXT(SQL_HANDLE) AS SQLTEXT
WHERE TOTAL_ELAPSED_TIME >=3600000) -- 1 hour

-- If DB status is not online
set @ee =( select COUNT(*) FROM sys.databases where state_desc in ('Offline','EMERGENCY', 'SUSPECT'))

-- If any login created on instance level
set @gg=( select count(*) from sys.syslogins nolock
where convert(varchar(10), createdate, 126)  >= convert(varchar(10), GETDATE(), 126) )

--If backup not done since 1 day
SET @HH = '0'
-- If job got modified by anyone.
set @ii =(select count(*) from  MSDB.DBO.SYSJOBS nolock where convert(varchar(10), date_modified, 126)  >= convert(varchar(10), GETDATE(), 126) )

--declare @timeTORUN int
--set @timeTORUN =(select SUBSTRING ( CONVERT(VARCHAR(8),GETDATE(),108),1,2))
--if (@timeTORUN =2 or @timeTORUN =4 or @timeTORUN =6 or @timeTORUN =8 or @timeTORUN =12 or @timeTORUN =14 or @timeTORUN =16 or @timeTORUN =18 or @timeTORUN =20 or @timeTORUN =22 or @timeTORUN =24)
--begin

--if database created in last one day
create table #Tmpdatabasecreation(
  databaseName varchar(100)
  , db_size nvarchar(100)
  , [owner] varchar(50)
  ,  [dbid] int
  , created date
  , [status] varchar(2000)
  ,compatiblity_level int )
insert #Tmpdatabasecreation(databaseName, db_size, [owner], [dbid],created,[status],compatiblity_level)
exec ('Sp_helpdb')
set @ff= (select count(*) from #Tmpdatabasecreation where DatabaseName not in ('tempdb')
and created >= convert(varchar(10), GETDATE(), 126) )
-- Drop table #Tmpdatabasecreation


 -- If AG data moment suspended
 set @jj= (select count(*) from CSMonitor.[dbo].[AG_ERROR_Data_Moment_suspended] where  [Date] >= convert(varchar(10), GETDATE(), 126) )

  -- If AG data moment Resumed
 set @kk= (select count(*) from CSMonitor.[dbo].[AG_ERROR_Data_Moment_Resumed] where  [Date] >= convert(varchar(10), GETDATE(), 126) )

  -- If AG Fail over
 set @ll= (select count(*) from CSMonitor.[dbo].[AG_ERROR_FailOver] where  [Date] >= convert(varchar(10), GETDATE(), 126) )

--end

CREATE TABLE #Datadog (
[metric] varchar(255) not null,
[type] varchar(50) not null,
[value] float not  null,
[tags] varchar(255))

insert into #Datadog values
('sqlserver.Agent.failedJob','gauge',@aa,'tag:FailedJob'),
('sqlserver.DB.LogSpace','gauge',@bb,'tag:LogSpace'),
('sqlserver.DB.Blocking','gauge',@cc,'tag:Blocking'),
('sqlserver.DB.LRQ','gauge',@dd,'tag:LRQ'),
('Sqlserver.DB.DBstatus','gauge',@ee,'tag:DBstatus'),
('Sqlserver.DB.NewDBCreated','gauge',@ff,'tag:NewDBCreated'),
('Sqlserver.DB.NewLoginCreated','gauge',@gg,'tag:NewLoginCreated'),
('Sqlserver.DB.BackupStatus','gauge',@hh,'tag:BackupStatus'),
('Sqlserver.DB.JobModified','gauge',@ii,'tag:JobModified'),
('Sqlserver.DB.Data_Moment_suspended','gauge',@jj,'tag:Data_Moment_suspended'),
('Sqlserver.DB.Data_Moment_Resumed','gauge',@kk,'tag:Data_Moment_Resumed'),
('Sqlserver.DB.FailOver','gauge',@ll,'tag:FailOver')
select * from #Datadog

END


Created a matrix in Datadog console to send the alert notification mail to DBA team.




2 comments:

  1. Hi there,
    Please help.me to provide me the steps to 'How to Create a matrix in Datadog console to send the alert notification mail to DBA team'.

    Thanks,
    PavanKumar B

    ReplyDelete
  2. Great and that i have a nifty give: How Much House Renovation Cost Philippines home remodeling companies

    ReplyDelete