Always-On
Monitoring
The below three error code which need
to monitor for Always-on
Query:
SELECT * FROM SYS.SYSMESSAGES WHERE MSGLANGID=1033 AND DESCRIPTION LIKE '%AVAILABILITY%' AND ERROR IN (1480, 35264, 35265)
error
|
Description
|
1480
|
The %S_MSG database
"%.*ls" is changing roles from "%ls" to "%ls"
because the mirroring session or availability group failed over due to %S_MSG. This is an informational message only.
No user action is required.
|
35264
|
Always On Availability Groups data
movement for database '%.*ls' has been suspended
for the following reason: "%S_MSG" (Source ID %d; Source string:
'%.*ls'). To resume data movement on the database, you will need to resume
the database manually. For info
|
35265
|
Always On Availability Groups data
movement for database '%.*ls' has been resumed.
This is an informational message only. No user action is required.
|
Note: As we
do not have DB mail option so that we need to use Datadog alerting.
Let’s begin….
Error:
1480: The Always on Failover occurred. The below steps need to perform on
Primary and DR server.
Step 1: Create an Alert in SQL Server to
capture the error event.
USE [msdb]
GO
EXEC msdb.dbo.sp_add_alert @name=N'AG_FailOver',
@message_id=1480,
@severity=0,
@enabled=1,
@delay_between_responses=0,
@include_event_description_in=0,
@category_name=N'[Uncategorized]'--,
--@job_id=N'6f93aabb-a0ba-4e57-8695-1febaebe6c7c'
GO
Step 2: Create a Job in SQL server to
send the error details to table.
USE [msdb]
GO
BEGIN TRANSACTION
DECLARE @ReturnCode INT
SELECT @ReturnCode = 0
/******
Object: JobCategory [[Uncategorized
(Local)]] Script Date: 11/12/2018
4:44:24 AM ******/
IF NOT EXISTS (SELECT name FROM msdb.dbo.syscategories WHERE name=N'[Uncategorized (Local)]'
AND category_class=1)
BEGIN
EXEC @ReturnCode = msdb.dbo.sp_add_category @class=N'JOB', @type=N'LOCAL', @name=N'[Uncategorized (Local)]'
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
END
DECLARE @jobId BINARY(16)
EXEC @ReturnCode = msdb.dbo.sp_add_job @job_name=N'AG_FailOver_Job',
@enabled=1,
@notify_level_eventlog=0,
@notify_level_email=0,
@notify_level_netsend=0,
@notify_level_page=0,
@delete_level=0,
@description=N'No description available.',
@category_name=N'[Uncategorized (Local)]',
@owner_login_name=N'sa', @job_id = @jobId OUTPUT
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
/******
Object: Step [AG_FailOver] Script Date: 11/12/2018 4:44:24 AM ******/
EXEC @ReturnCode = msdb.dbo.sp_add_jobstep @job_id=@jobId, @step_name=N'AG_FailOver',
@step_id=1,
@cmdexec_success_code=0,
@on_success_action=1,
@on_success_step_id=0,
@on_fail_action=2,
@on_fail_step_id=0,
@retry_attempts=0,
@retry_interval=0,
@os_run_priority=0, @subsystem=N'TSQL',
@command=N'Insert into
[CSMonitor].[dbo].[AG_ERROR_FailOver] (Error_code,Error_desc,date)
values(1480,''AG Failover'',getdate())',
@database_name=N'master',
@flags=0
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
EXEC @ReturnCode = msdb.dbo.sp_update_job @job_id = @jobId, @start_step_id = 1
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
EXEC @ReturnCode = msdb.dbo.sp_add_jobserver @job_id = @jobId, @server_name = N'(local)'
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
COMMIT TRANSACTION
GOTO EndSave
QuitWithRollback:
IF (@@TRANCOUNT > 0) ROLLBACK TRANSACTION
EndSave:
GO
Step 3: Create a table in CSMonitor
database to log the error details.
USE [CSMonitor]
GO
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[AG_ERROR_FailOver](
[Id] [int] IDENTITY(1,1) NOT NULL,
[Error_code] [int] NULL,
[Error_desc] [varchar](max) NULL,
[Date] [datetime] NULL
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
GO
Step 4: Open
the property of the created Alert (AG_FailOver), Click on Response Ã
check the Execute Job check
box Ã
Select the create job(AG_FailOver_Job)
from drop down list and Press OK
button.
Error:
35264: The Data Moment suspended. The below steps need to perform on
Primary and DR server.
Step 1: Create an Alert in SQL Server
to capture the error event.
USE [msdb]
GO
EXEC msdb.dbo.sp_add_alert @name=N'AG_Data_Moment_Suspended',
@message_id=35264,
@severity=0,
@enabled=1,
@delay_between_responses=0,
@include_event_description_in=0,
@category_name=N'[Uncategorized]'--,
--@job_id=N'6f93aabb-a0ba-4e57-8695-1febaebe6c7c'
GO
Step 2: Create a Job in SQL server to
send the error details to table.
USE [msdb]
GO
BEGIN TRANSACTION
DECLARE @ReturnCode INT
SELECT @ReturnCode = 0
/******
Object: JobCategory [[Uncategorized
(Local)]] Script Date: 11/12/2018
4:44:24 AM ******/
IF NOT EXISTS (SELECT name FROM msdb.dbo.syscategories WHERE name=N'[Uncategorized (Local)]'
AND category_class=1)
BEGIN
EXEC @ReturnCode = msdb.dbo.sp_add_category @class=N'JOB', @type=N'LOCAL', @name=N'[Uncategorized (Local)]'
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
END
DECLARE @jobId BINARY(16)
EXEC @ReturnCode = msdb.dbo.sp_add_job @job_name=N'AG_Data_Movment_Suspended_Job',
@enabled=1,
@notify_level_eventlog=0,
@notify_level_email=0,
@notify_level_netsend=0,
@notify_level_page=0,
@delete_level=0,
@description=N'No description available.',
@category_name=N'[Uncategorized (Local)]',
@owner_login_name=N'sa', @job_id = @jobId OUTPUT
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
/******
Object: Step
[AG_Data_Movment_Suspended] Script
Date: 11/12/2018 4:44:24 AM ******/
EXEC @ReturnCode = msdb.dbo.sp_add_jobstep @job_id=@jobId, @step_name=N'AG_Data_Movment_Suspended',
@step_id=1,
@cmdexec_success_code=0,
@on_success_action=1,
@on_success_step_id=0,
@on_fail_action=2,
@on_fail_step_id=0,
@retry_attempts=0,
@retry_interval=0,
@os_run_priority=0, @subsystem=N'TSQL',
@command=N'Insert into
[CSMonitor].[dbo].[AG_ERROR_Data_Moment_suspended] (Error_code,Error_desc,date)
values(35264,''Data Moment suspended'',getdate())',
@database_name=N'master',
@flags=0
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
EXEC @ReturnCode = msdb.dbo.sp_update_job @job_id = @jobId, @start_step_id = 1
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
EXEC @ReturnCode = msdb.dbo.sp_add_jobserver @job_id = @jobId, @server_name = N'(local)'
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
COMMIT TRANSACTION
GOTO EndSave
QuitWithRollback:
IF (@@TRANCOUNT > 0) ROLLBACK TRANSACTION
EndSave:
GO
Step 3: Create a table in CSMonitor
database to log the error details.
USE [CSMonitor]
GO
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[AG_ERROR_Data_Moment_suspended](
[Id] [int] IDENTITY(1,1) NOT NULL,
[Error_code] [int] NULL,
[Error_desc] [varchar](max) NULL,
[Date] [datetime] NULL
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
GO
Step 4: Open the property of the created Alert (AG_Data_Moment_Suspended), Click on Response Ã
check the Execute Job check
box Ã
Select the create job() from drop down list and Press OK button.
Error:
35265: The Data Moment resumed. The below steps need to perform on
Primary and DR server.
Step 1: Create an Alert in SQL Server
to capture the error event.
USE [msdb]
GO
EXEC msdb.dbo.sp_add_alert @name=N'AG_Data_Moment_Resumed',
@message_id=35265,
@severity=0,
@enabled=1,
@delay_between_responses=0,
@include_event_description_in=0,
@category_name=N'[Uncategorized]'--,
--@job_id=N'6f93aabb-a0ba-4e57-8695-1febaebe6c7c'
GO
Step 2: Create a Job in SQL server to
send the error details to table.
USE [msdb]
GO
BEGIN TRANSACTION
DECLARE @ReturnCode INT
SELECT @ReturnCode = 0
/******
Object: JobCategory [[Uncategorized
(Local)]] Script Date: 11/12/2018
4:44:24 AM ******/
IF NOT EXISTS (SELECT name FROM msdb.dbo.syscategories WHERE name=N'[Uncategorized (Local)]'
AND category_class=1)
BEGIN
EXEC @ReturnCode = msdb.dbo.sp_add_category @class=N'JOB', @type=N'LOCAL', @name=N'[Uncategorized (Local)]'
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
END
DECLARE @jobId BINARY(16)
EXEC @ReturnCode = msdb.dbo.sp_add_job @job_name=N'AG_Data_Movment_Resumed_Job',
@enabled=1,
@notify_level_eventlog=0,
@notify_level_email=0,
@notify_level_netsend=0,
@notify_level_page=0,
@delete_level=0,
@description=N'No description available.',
@category_name=N'[Uncategorized (Local)]',
@owner_login_name=N'sa', @job_id = @jobId OUTPUT
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
/******
Object: Step [AG_Data_Movment_Resumed] Script Date: 11/12/2018 4:44:24 AM ******/
EXEC @ReturnCode = msdb.dbo.sp_add_jobstep @job_id=@jobId, @step_name=N'AG_Data_Movment_Resumed',
@step_id=1,
@cmdexec_success_code=0,
@on_success_action=1,
@on_success_step_id=0,
@on_fail_action=2,
@on_fail_step_id=0,
@retry_attempts=0,
@retry_interval=0,
@os_run_priority=0, @subsystem=N'TSQL',
@command=N'Insert into
[CSMonitor].[dbo].[AG_ERROR_Data_Moment_Resumed] (Error_code,Error_desc,date)
values(35265,''Data Moment Resumed'',getdate())',
@database_name=N'master',
@flags=0
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
EXEC @ReturnCode = msdb.dbo.sp_update_job @job_id = @jobId, @start_step_id = 1
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
EXEC @ReturnCode = msdb.dbo.sp_add_jobserver @job_id = @jobId, @server_name = N'(local)'
IF
(@@ERROR <> 0 OR @ReturnCode <> 0) GOTO QuitWithRollback
COMMIT TRANSACTION
GOTO EndSave
QuitWithRollback:
IF (@@TRANCOUNT > 0) ROLLBACK TRANSACTION
EndSave:
GO
Step 3: Create a table in CSMonitor
database to log the error details.
USE [CSMonitor]
GO
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
CREATE TABLE [dbo].[AG_ERROR_Data_Moment_Resumed](
[Id] [int] IDENTITY(1,1) NOT NULL,
[Error_code] [int] NULL,
[Error_desc] [varchar](max) NULL,
[Date] [datetime] NULL
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
GO
Step 4: Open the property of the created Alert (AG_Data_Moment_Resumed),
Click on Response Ã
check the Execute Job check
box Ã
Select the create job(AG_Data_Moment_Resumed_Job)
from drop down list and Press OK
button.
·
Modify
DataDog stored procedure to add the script for Datadog agent. Copy below SP and
execute on Primary and DR server.
USE [msdb]
GO
/******
Object: StoredProcedure
[dbo].[SP_datadogtest] Script Date:
11/12/2018 5:20:16 AM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
ALTER PROCEDURE [dbo].[SP_datadogtest]
AS
BEGIN
-- SET
NOCOUNT ON added to prevent extra result sets from
--
interfering with SELECT statements.
SET NOCOUNT ON;
declare @AA int
declare @bb int
declare @cc int
declare @dd int
declare @ee int
declare @ff int
declare @gg int
declare @hh int
declare @ii int
declare @jj int
declare @kk int
declare @ll int
DECLARE @DBCOUNT INT
--
If the job got failed, success or cancel by anyone.
set @aa=( SELECt COUNT(*) FROM MSDB.DBO.SYSJOBSERVERS SJS
LEFT OUTER JOIN MSDB.DBO.SYSJOBS SJ ON (SJ.JOB_ID = SJS.JOB_ID)
where SJS.LAST_RUN_OUTCOME in (0,2) and enabled =1
and convert(varchar(10), last_run_date, 126) = (convert(varchar(10), GETDATE(), 112) )) -- 0 failed 2 cannaclled
--
If the log file got full more than 80 percente
create table #TmpLOGSPACE(
DatabaseName varchar(100)
, LOGSIZE_MB decimal(18, 9)
, LOGSPACE_USED decimal(18, 9)
, LOGSTATUS decimal(18, 9))
insert #TmpLOGSPACE(DatabaseName, LOGSIZE_MB, LOGSPACE_USED, LOGSTATUS)
exec
('DBCC
SQLPERF(LOGSPACE);')
set @bb= (select count(*) from #TmpLOGSPACE where DatabaseName not in ('Model','master','msdb')
and LOGSPACE_USED > 90) -- This is value which can
be change as per our requirement.
--
Drop table #TmpLOGSPACE
--
If the blocking is more than 10 minutes
set @cc=(SELECT count (*) FROM SYS.DM_EXEC_REQUESTS REQ
CROSS APPLY SYS.DM_EXEC_SQL_TEXT(SQL_HANDLE) AS SQLTEXT
WHERE TOTAL_ELAPSED_TIME >=600000 -- 10 MINUTES
and blocking_session_id <>0)
--
If query running more than 1 hour
set @dd=(SELECT count(*) FROM SYS.DM_EXEC_REQUESTS REQ
CROSS APPLY SYS.DM_EXEC_SQL_TEXT(SQL_HANDLE) AS SQLTEXT
WHERE TOTAL_ELAPSED_TIME >=3600000) -- 1 hour
--
If DB status is not online
set @ee =( select COUNT(*) FROM sys.databases where state_desc in ('Offline','EMERGENCY', 'SUSPECT'))
--
If any login created on instance level
set @gg=( select count(*) from sys.syslogins nolock
where convert(varchar(10), createdate, 126) >= convert(varchar(10), GETDATE(), 126) )
--If
backup not done since 1 day
SET @HH = '0'
--
If job got modified by anyone.
set @ii =(select count(*) from MSDB.DBO.SYSJOBS nolock where convert(varchar(10), date_modified, 126) >= convert(varchar(10), GETDATE(), 126) )
--declare
@timeTORUN int
--set
@timeTORUN =(select SUBSTRING ( CONVERT(VARCHAR(8),GETDATE(),108),1,2))
--if
(@timeTORUN =2 or @timeTORUN =4 or @timeTORUN =6 or @timeTORUN =8 or @timeTORUN
=12 or @timeTORUN =14 or @timeTORUN =16 or @timeTORUN =18 or @timeTORUN =20 or
@timeTORUN =22 or @timeTORUN =24)
--begin
--if
database created in last one day
create table #Tmpdatabasecreation(
databaseName varchar(100)
, db_size nvarchar(100)
, [owner] varchar(50)
, [dbid] int
, created date
, [status] varchar(2000)
,compatiblity_level int )
insert #Tmpdatabasecreation(databaseName, db_size, [owner], [dbid],created,[status],compatiblity_level)
exec
('Sp_helpdb')
set @ff= (select count(*) from #Tmpdatabasecreation where DatabaseName not in ('tempdb')
and created >= convert(varchar(10), GETDATE(), 126) )
--
Drop table #Tmpdatabasecreation
-- If
AG data moment suspended
set @jj= (select count(*) from CSMonitor.[dbo].[AG_ERROR_Data_Moment_suspended] where [Date] >= convert(varchar(10), GETDATE(), 126) )
-- If
AG data moment Resumed
set @kk= (select count(*) from CSMonitor.[dbo].[AG_ERROR_Data_Moment_Resumed] where [Date] >= convert(varchar(10), GETDATE(), 126) )
-- If
AG Fail over
set @ll= (select count(*) from CSMonitor.[dbo].[AG_ERROR_FailOver] where [Date] >= convert(varchar(10), GETDATE(), 126) )
--end
CREATE TABLE #Datadog (
[metric]
varchar(255) not null,
[type]
varchar(50) not null,
[value]
float not null,
[tags]
varchar(255))
insert into #Datadog values
('sqlserver.Agent.failedJob','gauge',@aa,'tag:FailedJob'),
('sqlserver.DB.LogSpace','gauge',@bb,'tag:LogSpace'),
('sqlserver.DB.Blocking','gauge',@cc,'tag:Blocking'),
('sqlserver.DB.LRQ','gauge',@dd,'tag:LRQ'),
('Sqlserver.DB.DBstatus','gauge',@ee,'tag:DBstatus'),
('Sqlserver.DB.NewDBCreated','gauge',@ff,'tag:NewDBCreated'),
('Sqlserver.DB.NewLoginCreated','gauge',@gg,'tag:NewLoginCreated'),
('Sqlserver.DB.BackupStatus','gauge',@hh,'tag:BackupStatus'),
('Sqlserver.DB.JobModified','gauge',@ii,'tag:JobModified'),
('Sqlserver.DB.Data_Moment_suspended','gauge',@jj,'tag:Data_Moment_suspended'),
('Sqlserver.DB.Data_Moment_Resumed','gauge',@kk,'tag:Data_Moment_Resumed'),
('Sqlserver.DB.FailOver','gauge',@ll,'tag:FailOver')
select * from #Datadog
END
Created
a matrix in Datadog console to send the alert notification mail to DBA team.
Hi there,
ReplyDeletePlease help.me to provide me the steps to 'How to Create a matrix in Datadog console to send the alert notification mail to DBA team'.
Thanks,
PavanKumar B